summary | refs | log | tree | commit | diff
path: root/src/third_party/wiredtiger/src/log/log.c
diff options
context:
space:
mode:
author: Luke Chen <luke.chen@mongodb.com> 2019-08-21 05:23:37 +0000
committer: evergreen <evergreen@mongodb.com> 2019-08-21 05:23:37 +0000
commit: ac41c65f6355f83aac70136324c98561ac79daa1 (patch)
tree: a7c3f7ef090b59c6a06838a02c96bd1d49e1c729 /src/third_party/wiredtiger/src/log/log.c
parent: f54709196711c63a429b71f47c584661286d675f (diff)
download: mongo-ac41c65f6355f83aac70136324c98561ac79daa1.tar.gz
Import wiredtiger: 7dfd9391862bc9a6d84868c4dc51689c45a3aacf from branch mongodb-4.4
ref: c809757d8b..7dfd939186
for: 4.3.1

WT-4658 Apply Clang Format
WT-4810 Adding WT_ERR_ASSERT and WT_RET_ASSERT macros
WT-5046 Prepared transactions aren't properly cleared from global table with WT_CONN_LOG_DEBUG_MODE enabled
Diffstat (limited to 'src/third_party/wiredtiger/src/log/log.c')
-rw-r--r-- src/third_party/wiredtiger/src/log/log.c | 5116
1 files changed, 2429 insertions, 2687 deletions
diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c
index 6361177c193..d6f18f82bb9 100644
--- a/src/third_party/wiredtiger/src/log/log.c
+++ b/src/third_party/wiredtiger/src/log/log.c
@@ -11,3079 +11,2821 @@
static int __log_newfile(WT_SESSION_IMPL *, bool, bool *);
static int __log_openfile(WT_SESSION_IMPL *, uint32_t, uint32_t, WT_FH **);
static int __log_truncate(WT_SESSION_IMPL *, WT_LSN *, bool, bool);
-static int __log_write_internal(
- WT_SESSION_IMPL *, WT_ITEM *, WT_LSN *, uint32_t);
+static int __log_write_internal(WT_SESSION_IMPL *, WT_ITEM *, WT_LSN *, uint32_t);
-#define WT_LOG_COMPRESS_SKIP (offsetof(WT_LOG_RECORD, record))
-#define WT_LOG_ENCRYPT_SKIP (offsetof(WT_LOG_RECORD, record))
+#define WT_LOG_COMPRESS_SKIP (offsetof(WT_LOG_RECORD, record))
+#define WT_LOG_ENCRYPT_SKIP (offsetof(WT_LOG_RECORD, record))
/* AUTOMATIC FLAG VALUE GENERATION START */
-#define WT_LOG_OPEN_CREATE_OK 0x1u /* Flag to __log_openfile() */
-/* AUTOMATIC FLAG VALUE GENERATION STOP */
+#define WT_LOG_OPEN_CREATE_OK 0x1u /* Flag to __log_openfile() */
+ /* AUTOMATIC FLAG VALUE GENERATION STOP */
/*
* __wt_log_printf --
- * Write a text message to the log.
+ * Write a text message to the log.
*/
int
__wt_log_printf(WT_SESSION_IMPL *session, const char *format, ...)
{
- WT_DECL_RET;
- va_list ap;
+ WT_DECL_RET;
+ va_list ap;
- va_start(ap, format);
- ret = __wt_log_vprintf(session, format, ap);
- va_end(ap);
- return (ret);
+ va_start(ap, format);
+ ret = __wt_log_vprintf(session, format, ap);
+ va_end(ap);
+ return (ret);
}
/*
* __log_checksum_match --
- * Given a log record, return whether the checksum matches.
+ * Given a log record, return whether the checksum matches.
*/
static bool
__log_checksum_match(WT_ITEM *buf, uint32_t reclen)
{
- WT_LOG_RECORD *logrec;
- uint32_t checksum_saved, checksum_tmp;
- bool checksum_matched;
+ WT_LOG_RECORD *logrec;
+ uint32_t checksum_saved, checksum_tmp;
+ bool checksum_matched;
- logrec = buf->mem;
- checksum_saved = checksum_tmp = logrec->checksum;
+ logrec = buf->mem;
+ checksum_saved = checksum_tmp = logrec->checksum;
#ifdef WORDS_BIGENDIAN
- checksum_tmp = __wt_bswap32(checksum_tmp);
+ checksum_tmp = __wt_bswap32(checksum_tmp);
#endif
- logrec->checksum = 0;
- checksum_matched = __wt_checksum_match(logrec, reclen, checksum_tmp);
- logrec->checksum = checksum_saved;
- return (checksum_matched);
+ logrec->checksum = 0;
+ checksum_matched = __wt_checksum_match(logrec, reclen, checksum_tmp);
+ logrec->checksum = checksum_saved;
+ return (checksum_matched);
}
/*
* __log_get_files --
- * Retrieve the list of all log-related files of the given prefix type.
+ * Retrieve the list of all log-related files of the given prefix type.
*/
static int
-__log_get_files(WT_SESSION_IMPL *session,
- const char *file_prefix, char ***filesp, u_int *countp)
+__log_get_files(WT_SESSION_IMPL *session, const char *file_prefix, char ***filesp, u_int *countp)
{
- WT_CONNECTION_IMPL *conn;
- const char *log_path;
-
- *countp = 0;
- *filesp = NULL;
-
- conn = S2C(session);
- log_path = conn->log_path;
- if (log_path == NULL)
- log_path = "";
- return (__wt_fs_directory_list(
- session, log_path, file_prefix, filesp, countp));
+ WT_CONNECTION_IMPL *conn;
+ const char *log_path;
+
+ *countp = 0;
+ *filesp = NULL;
+
+ conn = S2C(session);
+ log_path = conn->log_path;
+ if (log_path == NULL)
+ log_path = "";
+ return (__wt_fs_directory_list(session, log_path, file_prefix, filesp, countp));
}
/*
* __log_get_files_single --
- * Retrieve a single log-related file of the given prefix type.
+ * Retrieve a single log-related file of the given prefix type.
*/
static int
-__log_get_files_single(WT_SESSION_IMPL *session,
- const char *file_prefix, char ***filesp, u_int *countp)
+__log_get_files_single(
+ WT_SESSION_IMPL *session, const char *file_prefix, char ***filesp, u_int *countp)
{
- WT_CONNECTION_IMPL *conn;
- const char *log_path;
-
- *countp = 0;
- *filesp = NULL;
-
- conn = S2C(session);
- log_path = conn->log_path;
- if (log_path == NULL)
- log_path = "";
- return (__wt_fs_directory_list_single(
- session, log_path, file_prefix, filesp, countp));
+ WT_CONNECTION_IMPL *conn;
+ const char *log_path;
+
+ *countp = 0;
+ *filesp = NULL;
+
+ conn = S2C(session);
+ log_path = conn->log_path;
+ if (log_path == NULL)
+ log_path = "";
+ return (__wt_fs_directory_list_single(session, log_path, file_prefix, filesp, countp));
}
/*
* __log_prealloc_remove --
- * Remove all previously created pre-allocated files.
+ * Remove all previously created pre-allocated files.
*/
static int
__log_prealloc_remove(WT_SESSION_IMPL *session)
{
- WT_DECL_RET;
- WT_LOG *log;
- uint32_t lognum;
- u_int i, logcount;
- char **logfiles;
-
- logfiles = NULL;
- logcount = 0;
- log = S2C(session)->log;
- __wt_spin_lock(session, &log->log_fs_lock);
- /*
- * Clean up any old interim pre-allocated files. We clean
- * up these files because settings may have changed upon reboot
- * and we want those settings to take effect right away.
- */
- WT_ERR(__log_get_files(session,
- WT_LOG_TMPNAME, &logfiles, &logcount));
- for (i = 0; i < logcount; i++) {
- WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
- WT_ERR(__wt_log_remove(session, WT_LOG_TMPNAME, lognum));
- }
- WT_ERR(__wt_fs_directory_list_free(session, &logfiles, logcount));
- WT_ERR(__log_get_files(session,
- WT_LOG_PREPNAME, &logfiles, &logcount));
- for (i = 0; i < logcount; i++) {
- WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
- WT_ERR(__wt_log_remove(session, WT_LOG_PREPNAME, lognum));
- }
-err: WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount));
- __wt_spin_unlock(session, &log->log_fs_lock);
- return (ret);
+ WT_DECL_RET;
+ WT_LOG *log;
+ uint32_t lognum;
+ u_int i, logcount;
+ char **logfiles;
+
+ logfiles = NULL;
+ logcount = 0;
+ log = S2C(session)->log;
+ __wt_spin_lock(session, &log->log_fs_lock);
+ /*
+ * Clean up any old interim pre-allocated files. We clean up these files because settings may
+ * have changed upon reboot and we want those settings to take effect right away.
+ */
+ WT_ERR(__log_get_files(session, WT_LOG_TMPNAME, &logfiles, &logcount));
+ for (i = 0; i < logcount; i++) {
+ WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
+ WT_ERR(__wt_log_remove(session, WT_LOG_TMPNAME, lognum));
+ }
+ WT_ERR(__wt_fs_directory_list_free(session, &logfiles, logcount));
+ WT_ERR(__log_get_files(session, WT_LOG_PREPNAME, &logfiles, &logcount));
+ for (i = 0; i < logcount; i++) {
+ WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
+ WT_ERR(__wt_log_remove(session, WT_LOG_PREPNAME, lognum));
+ }
+err:
+ WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount));
+ __wt_spin_unlock(session, &log->log_fs_lock);
+ return (ret);
}
/*
* __log_wait_for_earlier_slot --
- * Wait for write_lsn to catch up to this slot.
+ * Wait for write_lsn to catch up to this slot.
*/
static void
__log_wait_for_earlier_slot(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
- WT_CONNECTION_IMPL *conn;
- WT_LOG *log;
- int yield_count;
-
- conn = S2C(session);
- log = conn->log;
- yield_count = 0;
-
- while (__wt_log_cmp(&log->write_lsn, &slot->slot_release_lsn) != 0) {
- /*
- * If we're on a locked path and the write LSN is not advancing,
- * unlock in case an earlier thread is trying to switch its
- * slot and complete its operation.
- */
- if (F_ISSET(session, WT_SESSION_LOCKED_SLOT))
- __wt_spin_unlock(session, &log->log_slot_lock);
- /*
- * This may not be initialized if we are starting at an
- * older log file version. So only signal if valid.
- */
- if (conn->log_wrlsn_cond != NULL)
- __wt_cond_signal(session, conn->log_wrlsn_cond);
- if (++yield_count < WT_THOUSAND)
- __wt_yield();
- else
- __wt_cond_wait(session, log->log_write_cond, 200, NULL);
- if (F_ISSET(session, WT_SESSION_LOCKED_SLOT))
- __wt_spin_lock(session, &log->log_slot_lock);
- }
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+ int yield_count;
+
+ conn = S2C(session);
+ log = conn->log;
+ yield_count = 0;
+
+ while (__wt_log_cmp(&log->write_lsn, &slot->slot_release_lsn) != 0) {
+ /*
+ * If we're on a locked path and the write LSN is not advancing, unlock in case an earlier
+ * thread is trying to switch its slot and complete its operation.
+ */
+ if (F_ISSET(session, WT_SESSION_LOCKED_SLOT))
+ __wt_spin_unlock(session, &log->log_slot_lock);
+ /*
+ * This may not be initialized if we are starting at an older log file version. So only
+ * signal if valid.
+ */
+ if (conn->log_wrlsn_cond != NULL)
+ __wt_cond_signal(session, conn->log_wrlsn_cond);
+ if (++yield_count < WT_THOUSAND)
+ __wt_yield();
+ else
+ __wt_cond_wait(session, log->log_write_cond, 200, NULL);
+ if (F_ISSET(session, WT_SESSION_LOCKED_SLOT))
+ __wt_spin_lock(session, &log->log_slot_lock);
+ }
}
/*
* __log_fs_read --
- * Wrapper when reading from a log file.
+ * Wrapper when reading from a log file.
*/
static int
-__log_fs_read(WT_SESSION_IMPL *session,
- WT_FH *fh, wt_off_t offset, size_t len, void *buf)
+__log_fs_read(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, void *buf)
{
- WT_DECL_RET;
+ WT_DECL_RET;
- __wt_capacity_throttle(session, len, WT_THROTTLE_LOG);
- if ((ret = __wt_read(session, fh, offset, len, buf)) != 0)
- WT_RET_MSG(session, ret, "%s: log read failure", fh->name);
- return (ret);
+ __wt_capacity_throttle(session, len, WT_THROTTLE_LOG);
+ if ((ret = __wt_read(session, fh, offset, len, buf)) != 0)
+ WT_RET_MSG(session, ret, "%s: log read failure", fh->name);
+ return (ret);
}
/*
* __log_fs_write --
- * Wrapper when writing to a log file. If we're writing to a new log
- * file for the first time wait for writes to the previous log file.
+ * Wrapper when writing to a log file. If we're writing to a new log file for the first time
+ * wait for writes to the previous log file.
*/
static int
-__log_fs_write(WT_SESSION_IMPL *session,
- WT_LOGSLOT *slot, wt_off_t offset, size_t len, const void *buf)
+__log_fs_write(
+ WT_SESSION_IMPL *session, WT_LOGSLOT *slot, wt_off_t offset, size_t len, const void *buf)
{
- WT_DECL_RET;
-
- /*
- * If we're writing into a new log file and we're running in
- * compatibility mode to an older release, we have to wait for all
- * writes to the previous log file to complete otherwise there could
- * be a hole at the end of the previous log file that we cannot detect.
- *
- * NOTE: Check for a version less than the one writing the system
- * record since we've had a log version change without any actual
- * file format changes.
- */
- if (S2C(session)->log->log_version < WT_LOG_VERSION_SYSTEM &&
- slot->slot_release_lsn.l.file < slot->slot_start_lsn.l.file) {
- __log_wait_for_earlier_slot(session, slot);
- WT_RET(__wt_log_force_sync(session, &slot->slot_release_lsn));
- }
- __wt_capacity_throttle(session, len, WT_THROTTLE_LOG);
- if ((ret = __wt_write(session, slot->slot_fh, offset, len, buf)) != 0)
- WT_PANIC_RET(session, ret,
- "%s: fatal log failure", slot->slot_fh->name);
- return (ret);
+ WT_DECL_RET;
+
+ /*
+ * If we're writing into a new log file and we're running in
+ * compatibility mode to an older release, we have to wait for all
+ * writes to the previous log file to complete otherwise there could
+ * be a hole at the end of the previous log file that we cannot detect.
+ *
+ * NOTE: Check for a version less than the one writing the system
+ * record since we've had a log version change without any actual
+ * file format changes.
+ */
+ if (S2C(session)->log->log_version < WT_LOG_VERSION_SYSTEM &&
+ slot->slot_release_lsn.l.file < slot->slot_start_lsn.l.file) {
+ __log_wait_for_earlier_slot(session, slot);
+ WT_RET(__wt_log_force_sync(session, &slot->slot_release_lsn));
+ }
+ __wt_capacity_throttle(session, len, WT_THROTTLE_LOG);
+ if ((ret = __wt_write(session, slot->slot_fh, offset, len, buf)) != 0)
+ WT_PANIC_RET(session, ret, "%s: fatal log failure", slot->slot_fh->name);
+ return (ret);
}
/*
* __wt_log_ckpt --
- * Record the given LSN as the checkpoint LSN and signal the archive
- * thread as needed.
+ * Record the given LSN as the checkpoint LSN and signal the archive thread as needed.
*/
void
__wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckpt_lsn)
{
- WT_CONNECTION_IMPL *conn;
- WT_LOG *log;
- int i;
-
- conn = S2C(session);
- log = conn->log;
- log->ckpt_lsn = *ckpt_lsn;
- if (conn->log_cond != NULL)
- __wt_cond_signal(session, conn->log_cond);
- /*
- * If we are storing debugging LSNs to retain additional log files
- * from archiving, then rotate the newest LSN into the array.
- */
- if (conn->debug_ckpt_cnt != 0) {
- for (i = (int)conn->debug_ckpt_cnt - 1; i > 0; --i)
- conn->debug_ckpt[i] = conn->debug_ckpt[i - 1];
- conn->debug_ckpt[0] = *ckpt_lsn;
- }
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+ int i;
+
+ conn = S2C(session);
+ log = conn->log;
+ log->ckpt_lsn = *ckpt_lsn;
+ if (conn->log_cond != NULL)
+ __wt_cond_signal(session, conn->log_cond);
+ /*
+ * If we are storing debugging LSNs to retain additional log files from archiving, then rotate
+ * the newest LSN into the array.
+ */
+ if (conn->debug_ckpt_cnt != 0) {
+ for (i = (int)conn->debug_ckpt_cnt - 1; i > 0; --i)
+ conn->debug_ckpt[i] = conn->debug_ckpt[i - 1];
+ conn->debug_ckpt[0] = *ckpt_lsn;
+ }
}
/*
* __wt_log_flush_lsn --
- * Force out buffered records and return the LSN, either the
- * write_start_lsn or write_lsn depending on the argument.
+ * Force out buffered records and return the LSN, either the write_start_lsn or write_lsn
+ * depending on the argument.
*/
int
__wt_log_flush_lsn(WT_SESSION_IMPL *session, WT_LSN *lsn, bool start)
{
- WT_CONNECTION_IMPL *conn;
- WT_LOG *log;
-
- conn = S2C(session);
- log = conn->log;
- WT_RET(__wt_log_force_write(session, 1, NULL));
- __wt_log_wrlsn(session, NULL);
- if (start)
- *lsn = log->write_start_lsn;
- else
- *lsn = log->write_lsn;
- return (0);
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+
+ conn = S2C(session);
+ log = conn->log;
+ WT_RET(__wt_log_force_write(session, 1, NULL));
+ __wt_log_wrlsn(session, NULL);
+ if (start)
+ *lsn = log->write_start_lsn;
+ else
+ *lsn = log->write_lsn;
+ return (0);
}
/*
* __wt_log_background --
- * Record the given LSN as the background LSN and signal the
- * thread as needed.
+ * Record the given LSN as the background LSN and signal the thread as needed.
*/
void
__wt_log_background(WT_SESSION_IMPL *session, WT_LSN *lsn)
{
- WT_CONNECTION_IMPL *conn;
- WT_LOG *log;
-
- conn = S2C(session);
- log = conn->log;
- /*
- * If a thread already set the LSN to a bigger LSN, we're done.
- */
- if (__wt_log_cmp(&session->bg_sync_lsn, lsn) > 0)
- return;
- session->bg_sync_lsn = *lsn;
-
- /*
- * Advance the logging subsystem background sync LSN if
- * needed.
- */
- __wt_spin_lock(session, &log->log_sync_lock);
- if (__wt_log_cmp(lsn, &log->bg_sync_lsn) > 0)
- log->bg_sync_lsn = *lsn;
- __wt_spin_unlock(session, &log->log_sync_lock);
- __wt_cond_signal(session, conn->log_file_cond);
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+
+ conn = S2C(session);
+ log = conn->log;
+ /*
+ * If a thread already set the LSN to a bigger LSN, we're done.
+ */
+ if (__wt_log_cmp(&session->bg_sync_lsn, lsn) > 0)
+ return;
+ session->bg_sync_lsn = *lsn;
+
+ /*
+ * Advance the logging subsystem background sync LSN if needed.
+ */
+ __wt_spin_lock(session, &log->log_sync_lock);
+ if (__wt_log_cmp(lsn, &log->bg_sync_lsn) > 0)
+ log->bg_sync_lsn = *lsn;
+ __wt_spin_unlock(session, &log->log_sync_lock);
+ __wt_cond_signal(session, conn->log_file_cond);
}
/*
* __wt_log_force_sync --
- * Force a sync of the log and files.
+ * Force a sync of the log and files.
*/
int
__wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn)
{
- WT_DECL_RET;
- WT_FH *log_fh;
- WT_LOG *log;
- uint64_t fsync_duration_usecs, time_start, time_stop;
-
- log = S2C(session)->log;
- log_fh = NULL;
-
- /*
- * We need to wait for the previous log file to get written
- * to disk before we sync out the current one and advance
- * the LSN. Signal the worker thread because we know the
- * LSN has moved into a later log file and there should be a
- * log file ready to close.
- */
- while (log->sync_lsn.l.file < min_lsn->l.file) {
- __wt_cond_signal(session, S2C(session)->log_file_cond);
- __wt_cond_wait(session, log->log_sync_cond, 10000, NULL);
- }
- __wt_spin_lock(session, &log->log_sync_lock);
- WT_ASSERT(session, log->log_dir_fh != NULL);
- /*
- * Sync the directory if the log file entry hasn't been written
- * into the directory.
- */
- if (log->sync_dir_lsn.l.file < min_lsn->l.file) {
- __wt_verbose(session, WT_VERB_LOG,
- "log_force_sync: sync directory %s to LSN %" PRIu32
- "/%" PRIu32,
- log->log_dir_fh->name, min_lsn->l.file, min_lsn->l.offset);
- time_start = __wt_clock(session);
- WT_ERR(__wt_fsync(session, log->log_dir_fh, true));
- time_stop = __wt_clock(session);
- fsync_duration_usecs = WT_CLOCKDIFF_US(time_stop, time_start);
- log->sync_dir_lsn = *min_lsn;
- WT_STAT_CONN_INCR(session, log_sync_dir);
- WT_STAT_CONN_INCRV(session,
- log_sync_dir_duration, fsync_duration_usecs);
- }
- /*
- * Sync the log file if needed.
- */
- if (__wt_log_cmp(&log->sync_lsn, min_lsn) < 0) {
- /*
- * Get our own file handle to the log file. It is possible
- * for the file handle in the log structure to change out
- * from under us and either be NULL or point to a different
- * file than we want.
- */
- WT_ERR(__log_openfile(session, min_lsn->l.file, 0, &log_fh));
- __wt_verbose(session, WT_VERB_LOG,
- "log_force_sync: sync %s to LSN %" PRIu32 "/%" PRIu32,
- log_fh->name, min_lsn->l.file, min_lsn->l.offset);
- time_start = __wt_clock(session);
- WT_ERR(__wt_fsync(session, log_fh, true));
- time_stop = __wt_clock(session);
- fsync_duration_usecs = WT_CLOCKDIFF_US(time_stop, time_start);
- log->sync_lsn = *min_lsn;
- WT_STAT_CONN_INCR(session, log_sync);
- WT_STAT_CONN_INCRV(session,
- log_sync_duration, fsync_duration_usecs);
- __wt_cond_signal(session, log->log_sync_cond);
- }
+ WT_DECL_RET;
+ WT_FH *log_fh;
+ WT_LOG *log;
+ uint64_t fsync_duration_usecs, time_start, time_stop;
+
+ log = S2C(session)->log;
+ log_fh = NULL;
+
+ /*
+ * We need to wait for the previous log file to get written to disk before we sync out the
+ * current one and advance the LSN. Signal the worker thread because we know the LSN has moved
+ * into a later log file and there should be a log file ready to close.
+ */
+ while (log->sync_lsn.l.file < min_lsn->l.file) {
+ __wt_cond_signal(session, S2C(session)->log_file_cond);
+ __wt_cond_wait(session, log->log_sync_cond, 10000, NULL);
+ }
+ __wt_spin_lock(session, &log->log_sync_lock);
+ WT_ASSERT(session, log->log_dir_fh != NULL);
+ /*
+ * Sync the directory if the log file entry hasn't been written into the directory.
+ */
+ if (log->sync_dir_lsn.l.file < min_lsn->l.file) {
+ __wt_verbose(session, WT_VERB_LOG,
+ "log_force_sync: sync directory %s to LSN %" PRIu32 "/%" PRIu32, log->log_dir_fh->name,
+ min_lsn->l.file, min_lsn->l.offset);
+ time_start = __wt_clock(session);
+ WT_ERR(__wt_fsync(session, log->log_dir_fh, true));
+ time_stop = __wt_clock(session);
+ fsync_duration_usecs = WT_CLOCKDIFF_US(time_stop, time_start);
+ log->sync_dir_lsn = *min_lsn;
+ WT_STAT_CONN_INCR(session, log_sync_dir);
+ WT_STAT_CONN_INCRV(session, log_sync_dir_duration, fsync_duration_usecs);
+ }
+ /*
+ * Sync the log file if needed.
+ */
+ if (__wt_log_cmp(&log->sync_lsn, min_lsn) < 0) {
+ /*
+ * Get our own file handle to the log file. It is possible for the file handle in the log
+ * structure to change out from under us and either be NULL or point to a different file
+ * than we want.
+ */
+ WT_ERR(__log_openfile(session, min_lsn->l.file, 0, &log_fh));
+ __wt_verbose(session, WT_VERB_LOG, "log_force_sync: sync %s to LSN %" PRIu32 "/%" PRIu32,
+ log_fh->name, min_lsn->l.file, min_lsn->l.offset);
+ time_start = __wt_clock(session);
+ WT_ERR(__wt_fsync(session, log_fh, true));
+ time_stop = __wt_clock(session);
+ fsync_duration_usecs = WT_CLOCKDIFF_US(time_stop, time_start);
+ log->sync_lsn = *min_lsn;
+ WT_STAT_CONN_INCR(session, log_sync);
+ WT_STAT_CONN_INCRV(session, log_sync_duration, fsync_duration_usecs);
+ __wt_cond_signal(session, log->log_sync_cond);
+ }
err:
- __wt_spin_unlock(session, &log->log_sync_lock);
- if (log_fh != NULL)
- WT_TRET(__wt_close(session, &log_fh));
- return (ret);
+ __wt_spin_unlock(session, &log->log_sync_lock);
+ if (log_fh != NULL)
+ WT_TRET(__wt_close(session, &log_fh));
+ return (ret);
}
/*
* __wt_log_needs_recovery --
- * Return 0 if we encounter a clean shutdown and 1 if recovery
- * must be run in the given variable.
+ * Return 0 if we encounter a clean shutdown and 1 if recovery must be run in the given
+ * variable.
*/
int
__wt_log_needs_recovery(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn, bool *recp)
{
- WT_CONNECTION_IMPL *conn;
- WT_CURSOR *c;
- WT_DECL_RET;
- WT_ITEM dummy_key, dummy_value;
- WT_LOG *log;
- uint64_t dummy_txnid;
- uint32_t dummy_fileid, dummy_optype, rectype;
-
- /*
- * Default is to run recovery always (regardless of whether this
- * connection has logging enabled).
- */
- *recp = true;
-
- conn = S2C(session);
- log = conn->log;
-
- if (log == NULL)
- return (0);
-
- /*
- * See if there are any data modification records between the
- * checkpoint LSN and the end of the log. If there are none then
- * we can skip recovery.
- */
- WT_RET(__wt_curlog_open(session, "log:", NULL, &c));
- c->set_key(c, ckp_lsn->l.file, ckp_lsn->l.offset, 0);
- if ((ret = c->search(c)) == 0) {
- while ((ret = c->next(c)) == 0) {
- /*
- * The only thing we care about is the rectype.
- */
- WT_ERR(c->get_value(c, &dummy_txnid, &rectype,
- &dummy_optype, &dummy_fileid,
- &dummy_key, &dummy_value));
- if (rectype == WT_LOGREC_COMMIT)
- break;
- }
- /*
- * If we get to the end of the log, we can skip recovery.
- */
- if (ret == WT_NOTFOUND) {
- *recp = false;
- ret = 0;
- }
- } else if (ret == WT_NOTFOUND)
- /*
- * We should always find the checkpoint LSN as it now points
- * to the beginning of a written log record. But if we're
- * running recovery on an earlier database we may not. In
- * that case, we need to run recovery, don't return an error.
- */
- ret = 0;
- else
- WT_ERR(ret);
-
-err: WT_TRET(c->close(c));
- return (ret);
+ WT_CONNECTION_IMPL *conn;
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ WT_ITEM dummy_key, dummy_value;
+ WT_LOG *log;
+ uint64_t dummy_txnid;
+ uint32_t dummy_fileid, dummy_optype, rectype;
+
+ /*
+ * Default is to run recovery always (regardless of whether this connection has logging
+ * enabled).
+ */
+ *recp = true;
+
+ conn = S2C(session);
+ log = conn->log;
+
+ if (log == NULL)
+ return (0);
+
+ /*
+ * See if there are any data modification records between the checkpoint LSN and the end of the
+ * log. If there are none then we can skip recovery.
+ */
+ WT_RET(__wt_curlog_open(session, "log:", NULL, &c));
+ c->set_key(c, ckp_lsn->l.file, ckp_lsn->l.offset, 0);
+ if ((ret = c->search(c)) == 0) {
+ while ((ret = c->next(c)) == 0) {
+ /*
+ * The only thing we care about is the rectype.
+ */
+ WT_ERR(c->get_value(
+ c, &dummy_txnid, &rectype, &dummy_optype, &dummy_fileid, &dummy_key, &dummy_value));
+ if (rectype == WT_LOGREC_COMMIT)
+ break;
+ }
+ /*
+ * If we get to the end of the log, we can skip recovery.
+ */
+ if (ret == WT_NOTFOUND) {
+ *recp = false;
+ ret = 0;
+ }
+ } else if (ret == WT_NOTFOUND)
+ /*
+ * We should always find the checkpoint LSN as it now points to the beginning of a written
+ * log record. But if we're running recovery on an earlier database we may not. In that
+ * case, we need to run recovery, don't return an error.
+ */
+ ret = 0;
+ else
+ WT_ERR(ret);
+
+err:
+ WT_TRET(c->close(c));
+ return (ret);
}
/*
* __wt_log_written_reset --
- * Interface to reset the amount of log written during this
- * checkpoint period. Called from the checkpoint code.
+ * Interface to reset the amount of log written during this checkpoint period. Called from the
+ * checkpoint code.
*/
void
__wt_log_written_reset(WT_SESSION_IMPL *session)
{
- WT_CONNECTION_IMPL *conn;
+ WT_CONNECTION_IMPL *conn;
- conn = S2C(session);
+ conn = S2C(session);
- if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
- conn->log->log_written = 0;
+ if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
+ conn->log->log_written = 0;
}
/*
* __wt_log_get_backup_files --
- * Retrieve the list of log files for taking a backup, either all of them
- * or only the active ones (those that are not candidates for archiving).
- * The caller is responsible for freeing the directory list returned.
+ * Retrieve the list of log files for taking a backup, either all of them or only the active
+ * ones (those that are not candidates for archiving). The caller is responsible for freeing the
+ * directory list returned.
*/
int
-__wt_log_get_backup_files(WT_SESSION_IMPL *session,
- char ***filesp, u_int *countp, uint32_t *maxid, bool active_only)
+__wt_log_get_backup_files(
+ WT_SESSION_IMPL *session, char ***filesp, u_int *countp, uint32_t *maxid, bool active_only)
{
- WT_DECL_RET;
- WT_LOG *log;
- uint32_t id, max, max_file, min_file;
- u_int count, i;
- char **files;
-
- *filesp = NULL;
- *countp = 0;
- *maxid = 0;
-
- id = 0;
- log = S2C(session)->log;
-
- /*
- * Capture the next file utilized for writing to the log, before forcing
- * a new log file. This represents the latest journal file that needs to
- * be copied. Note the checkpoint selected for backup may be writing to
- * an even later log file. In that case, copying the journal files is
- * correct, but wasteful.
- */
- max_file = log->alloc_lsn.l.file;
-
- /*
- * Capture the journal file the current checkpoint started in. The
- * current checkpoint or a later one may be selected for backing up,
- * requiring log files as early as this file. Together with max_file,
- * this defines the range of journal files to include.
- */
- min_file = log->ckpt_lsn.l.file;
-
- /*
- * Force the current slot to get written to the file. Also switch to
- * using a new log file. That log file will be removed from the list of
- * files returned. New writes will not be included in the backup.
- */
- if (active_only)
- F_SET(log, WT_LOG_FORCE_NEWFILE);
- WT_RET(__wt_log_force_write(session, 1, NULL));
- WT_RET(__log_get_files(session, WT_LOG_FILENAME, &files, &count));
-
- for (max = 0, i = 0; i < count; ) {
- WT_ERR(__wt_log_extract_lognum(session, files[i], &id));
- if (active_only &&
- (id < min_file || id > max_file)) {
- /*
- * Any files not being returned are individually freed
- * and the array adjusted.
- */
- __wt_free(session, files[i]);
- files[i] = files[count - 1];
- files[--count] = NULL;
- } else {
- if (id > max)
- max = id;
- i++;
- }
- }
-
- *maxid = max;
- *filesp = files;
- *countp = count;
-
- /*
- * Only free on error. The caller is responsible for calling free
- * once it is done using the returned list.
- */
- if (0) {
-err: WT_TRET(__wt_fs_directory_list_free(session, &files, count));
- }
- return (ret);
+ WT_DECL_RET;
+ WT_LOG *log;
+ uint32_t id, max, max_file, min_file;
+ u_int count, i;
+ char **files;
+
+ *filesp = NULL;
+ *countp = 0;
+ *maxid = 0;
+
+ id = 0;
+ log = S2C(session)->log;
+
+ /*
+ * Capture the next file utilized for writing to the log, before forcing a new log file. This
+ * represents the latest journal file that needs to be copied. Note the checkpoint selected for
+ * backup may be writing to an even later log file. In that case, copying the journal files is
+ * correct, but wasteful.
+ */
+ max_file = log->alloc_lsn.l.file;
+
+ /*
+ * Capture the journal file the current checkpoint started in. The current checkpoint or a later
+ * one may be selected for backing up, requiring log files as early as this file. Together with
+ * max_file, this defines the range of journal files to include.
+ */
+ min_file = log->ckpt_lsn.l.file;
+
+ /*
+ * Force the current slot to get written to the file. Also switch to using a new log file. That
+ * log file will be removed from the list of files returned. New writes will not be included in
+ * the backup.
+ */
+ if (active_only)
+ F_SET(log, WT_LOG_FORCE_NEWFILE);
+ WT_RET(__wt_log_force_write(session, 1, NULL));
+ WT_RET(__log_get_files(session, WT_LOG_FILENAME, &files, &count));
+
+ for (max = 0, i = 0; i < count;) {
+ WT_ERR(__wt_log_extract_lognum(session, files[i], &id));
+ if (active_only && (id < min_file || id > max_file)) {
+ /*
+ * Any files not being returned are individually freed and the array adjusted.
+ */
+ __wt_free(session, files[i]);
+ files[i] = files[count - 1];
+ files[--count] = NULL;
+ } else {
+ if (id > max)
+ max = id;
+ i++;
+ }
+ }
+
+ *maxid = max;
+ *filesp = files;
+ *countp = count;
+
+ /*
+ * Only free on error. The caller is responsible for calling free once it is done using the
+ * returned list.
+ */
+ if (0) {
+err:
+ WT_TRET(__wt_fs_directory_list_free(session, &files, count));
+ }
+ return (ret);
}
/*
* __log_filename --
- * Given a log number, return a WT_ITEM of a generated log file name
- * of the given prefix type.
+ * Given a log number, return a WT_ITEM of a generated log file name of the given prefix type.
*/
static int
-__log_filename(WT_SESSION_IMPL *session,
- uint32_t id, const char *file_prefix, WT_ITEM *buf)
+__log_filename(WT_SESSION_IMPL *session, uint32_t id, const char *file_prefix, WT_ITEM *buf)
{
- return (__wt_filename_construct(session,
- S2C(session)->log_path, file_prefix, UINTMAX_MAX, id, buf));
+ return (
+ __wt_filename_construct(session, S2C(session)->log_path, file_prefix, UINTMAX_MAX, id, buf));
}
/*
* __wt_log_extract_lognum --
- * Given a log file name, extract out the log number.
+ * Given a log file name, extract out the log number.
*/
int
-__wt_log_extract_lognum(
- WT_SESSION_IMPL *session, const char *name, uint32_t *id)
+__wt_log_extract_lognum(WT_SESSION_IMPL *session, const char *name, uint32_t *id)
{
- const char *p;
-
- if (id == NULL || name == NULL)
- WT_RET_MSG(session, EINVAL,
- "unexpected usage: no id or no name");
- if ((p = strrchr(name, '.')) == NULL ||
- /* NOLINTNEXTLINE(cert-err34-c) */
- sscanf(++p, "%" SCNu32, id) != 1)
- WT_RET_MSG(session, WT_ERROR, "Bad log file name '%s'", name);
- return (0);
+ const char *p;
+
+ if (id == NULL || name == NULL)
+ WT_RET_MSG(session, EINVAL, "unexpected usage: no id or no name");
+ if ((p = strrchr(name, '.')) == NULL ||
+ /* NOLINTNEXTLINE(cert-err34-c) */
+ sscanf(++p, "%" SCNu32, id) != 1)
+ WT_RET_MSG(session, WT_ERROR, "Bad log file name '%s'", name);
+ return (0);
}
/*
* __wt_log_reset --
- * Reset the existing log file to after the given file number.
- * Called from recovery when toggling logging back on, it was off
- * the previous open but it was on earlier before that toggle.
+ * Reset the existing log file to after the given file number. Called from recovery when
+ * toggling logging back on, it was off the previous open but it was on earlier before that
+ * toggle.
*/
int
__wt_log_reset(WT_SESSION_IMPL *session, uint32_t lognum)
{
- WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
- WT_LOG *log;
- uint32_t old_lognum;
- u_int i, logcount;
- char **logfiles;
-
- conn = S2C(session);
- log = conn->log;
-
- if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) ||
- log->fileid > lognum)
- return (0);
-
- WT_ASSERT(session, F_ISSET(conn, WT_CONN_RECOVERING));
- WT_ASSERT(session, !F_ISSET(conn, WT_CONN_READONLY));
- /*
- * We know we're single threaded and called from recovery only when
- * toggling logging back on. Therefore the only log files we have are
- * old and outdated and the new one created when logging opened before
- * recovery. We have to remove all old log files first and then create
- * the new one so that log file numbers are contiguous in the file
- * system.
- */
- WT_RET(__wt_close(session, &log->log_fh));
- WT_RET(__log_get_files(session, WT_LOG_FILENAME, &logfiles, &logcount));
- for (i = 0; i < logcount; i++) {
- WT_ERR(__wt_log_extract_lognum(
- session, logfiles[i], &old_lognum));
- WT_ASSERT(session, old_lognum < lognum || lognum == 1);
- WT_ERR(__wt_log_remove(session, WT_LOG_FILENAME, old_lognum));
- }
- log->fileid = lognum;
-
- /* Send in true to update connection creation LSNs. */
- WT_WITH_SLOT_LOCK(session, log,
- ret = __log_newfile(session, true, NULL));
- WT_ERR(__wt_log_slot_init(session, false));
-err: WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount));
- return (ret);
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+ uint32_t old_lognum;
+ u_int i, logcount;
+ char **logfiles;
+
+ conn = S2C(session);
+ log = conn->log;
+
+ if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) || log->fileid > lognum)
+ return (0);
+
+ WT_ASSERT(session, F_ISSET(conn, WT_CONN_RECOVERING));
+ WT_ASSERT(session, !F_ISSET(conn, WT_CONN_READONLY));
+ /*
+ * We know we're single threaded and called from recovery only when toggling logging back on.
+ * Therefore the only log files we have are old and outdated and the new one created when
+ * logging opened before recovery. We have to remove all old log files first and then create the
+ * new one so that log file numbers are contiguous in the file system.
+ */
+ WT_RET(__wt_close(session, &log->log_fh));
+ WT_RET(__log_get_files(session, WT_LOG_FILENAME, &logfiles, &logcount));
+ for (i = 0; i < logcount; i++) {
+ WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &old_lognum));
+ WT_ASSERT(session, old_lognum < lognum || lognum == 1);
+ WT_ERR(__wt_log_remove(session, WT_LOG_FILENAME, old_lognum));
+ }
+ log->fileid = lognum;
+
+ /* Send in true to update connection creation LSNs. */
+ WT_WITH_SLOT_LOCK(session, log, ret = __log_newfile(session, true, NULL));
+ WT_ERR(__wt_log_slot_init(session, false));
+err:
+ WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount));
+ return (ret);
}
/*
* __log_prealloc --
- * Pre-allocate a log file.
+ * Pre-allocate a log file.
*/
static int
__log_prealloc(WT_SESSION_IMPL *session, WT_FH *fh)
{
- WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
- WT_LOG *log;
-
- conn = S2C(session);
- log = conn->log;
-
- /*
- * If the user configured zero filling, pre-allocate the log file
- * manually. Otherwise use the file extension method to create
- * and zero the log file based on what is available.
- */
- if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ZERO_FILL))
- return (__wt_file_zero(session, fh,
- log->first_record, conn->log_file_max));
-
- /* If configured to not extend the file, we're done. */
- if (conn->log_extend_len == 0)
- return (0);
-
- /*
- * We have exclusive access to the log file and there are no other
- * writes happening concurrently, so there are no locking issues.
- */
- ret = __wt_fextend(session, fh, conn->log_extend_len);
- return (ret == EBUSY || ret == ENOTSUP ? 0 : ret);
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+
+ conn = S2C(session);
+ log = conn->log;
+
+ /*
+ * If the user configured zero filling, pre-allocate the log file manually. Otherwise use the
+ * file extension method to create and zero the log file based on what is available.
+ */
+ if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ZERO_FILL))
+ return (__wt_file_zero(session, fh, log->first_record, conn->log_file_max));
+
+ /* If configured to not extend the file, we're done. */
+ if (conn->log_extend_len == 0)
+ return (0);
+
+ /*
+ * We have exclusive access to the log file and there are no other writes happening
+ * concurrently, so there are no locking issues.
+ */
+ ret = __wt_fextend(session, fh, conn->log_extend_len);
+ return (ret == EBUSY || ret == ENOTSUP ? 0 : ret);
}
/*
* __log_size_fit --
- * Return whether or not recsize will fit in the log file.
+ * Return whether or not recsize will fit in the log file.
*/
static int
__log_size_fit(WT_SESSION_IMPL *session, WT_LSN *lsn, uint64_t recsize)
{
- WT_CONNECTION_IMPL *conn;
- WT_LOG *log;
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
- conn = S2C(session);
- log = conn->log;
- return (lsn->l.offset == log->first_record ||
- lsn->l.offset + (wt_off_t)recsize < conn->log_file_max);
+ conn = S2C(session);
+ log = conn->log;
+ return (
+ lsn->l.offset == log->first_record || lsn->l.offset + (wt_off_t)recsize < conn->log_file_max);
}
/*
* __log_decompress --
- * Decompress a log record.
+ * Decompress a log record.
*/
static int
__log_decompress(WT_SESSION_IMPL *session, WT_ITEM *in, WT_ITEM *out)
{
- WT_COMPRESSOR *compressor;
- WT_CONNECTION_IMPL *conn;
- WT_LOG_RECORD *logrec;
- size_t result_len, skip;
- uint32_t uncompressed_size;
-
- conn = S2C(session);
- logrec = (WT_LOG_RECORD *)in->mem;
- skip = WT_LOG_COMPRESS_SKIP;
- compressor = conn->log_compressor;
- if (compressor == NULL || compressor->decompress == NULL)
- WT_RET_MSG(session, WT_ERROR,
- "Compressed record with no configured compressor");
- uncompressed_size = logrec->mem_len;
- WT_RET(__wt_buf_initsize(session, out, uncompressed_size));
- memcpy(out->mem, in->mem, skip);
- WT_RET(compressor->decompress(compressor, &session->iface,
- (uint8_t *)in->mem + skip, in->size - skip,
- (uint8_t *)out->mem + skip,
- uncompressed_size - skip, &result_len));
-
- /*
- * If checksums were turned off because we're depending on the
- * decompression to fail on any corrupted data, we'll end up
- * here after corruption happens. If we're salvaging the file,
- * it's OK, otherwise it's really, really bad.
- */
- if (result_len != uncompressed_size - WT_LOG_COMPRESS_SKIP)
- WT_RET_MSG(session, WT_ERROR,
- "decompression failed with incorrect size");
-
- return (0);
+ WT_COMPRESSOR *compressor;
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG_RECORD *logrec;
+ size_t result_len, skip;
+ uint32_t uncompressed_size;
+
+ conn = S2C(session);
+ logrec = (WT_LOG_RECORD *)in->mem;
+ skip = WT_LOG_COMPRESS_SKIP;
+ compressor = conn->log_compressor;
+ if (compressor == NULL || compressor->decompress == NULL)
+ WT_RET_MSG(session, WT_ERROR, "Compressed record with no configured compressor");
+ uncompressed_size = logrec->mem_len;
+ WT_RET(__wt_buf_initsize(session, out, uncompressed_size));
+ memcpy(out->mem, in->mem, skip);
+ WT_RET(compressor->decompress(compressor, &session->iface, (uint8_t *)in->mem + skip,
+ in->size - skip, (uint8_t *)out->mem + skip, uncompressed_size - skip, &result_len));
+
+ /*
+ * If checksums were turned off because we're depending on the decompression to fail on any
+ * corrupted data, we'll end up here after corruption happens. If we're salvaging the file, it's
+ * OK, otherwise it's really, really bad.
+ */
+ if (result_len != uncompressed_size - WT_LOG_COMPRESS_SKIP)
+ WT_RET_MSG(session, WT_ERROR, "decompression failed with incorrect size");
+
+ return (0);
}
/*
* __log_decrypt --
- * Decrypt a log record.
+ * Decrypt a log record.
*/
static int
__log_decrypt(WT_SESSION_IMPL *session, WT_ITEM *in, WT_ITEM *out)
{
- WT_CONNECTION_IMPL *conn;
- WT_ENCRYPTOR *encryptor;
- WT_KEYED_ENCRYPTOR *kencryptor;
-
- conn = S2C(session);
- kencryptor = conn->kencryptor;
- if (kencryptor == NULL ||
- (encryptor = kencryptor->encryptor) == NULL ||
- encryptor->decrypt == NULL)
- WT_RET_MSG(session, WT_ERROR,
- "Encrypted record with no configured decrypt method");
-
- return (__wt_decrypt(session, encryptor, WT_LOG_ENCRYPT_SKIP, in, out));
+ WT_CONNECTION_IMPL *conn;
+ WT_ENCRYPTOR *encryptor;
+ WT_KEYED_ENCRYPTOR *kencryptor;
+
+ conn = S2C(session);
+ kencryptor = conn->kencryptor;
+ if (kencryptor == NULL || (encryptor = kencryptor->encryptor) == NULL ||
+ encryptor->decrypt == NULL)
+ WT_RET_MSG(session, WT_ERROR, "Encrypted record with no configured decrypt method");
+
+ return (__wt_decrypt(session, encryptor, WT_LOG_ENCRYPT_SKIP, in, out));
}
/*
* __wt_log_fill --
- * Copy a thread's log records into the assigned slot.
+ * Copy a thread's log records into the assigned slot.
*/
int
-__wt_log_fill(WT_SESSION_IMPL *session,
- WT_MYSLOT *myslot, bool force, WT_ITEM *record, WT_LSN *lsnp)
+__wt_log_fill(
+ WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bool force, WT_ITEM *record, WT_LSN *lsnp)
{
- WT_DECL_RET;
-
- /*
- * Call write or copy into the buffer. For now the offset is the
- * real byte offset. If the offset becomes a unit of WT_LOG_ALIGN this
- * is where we would multiply by WT_LOG_ALIGN to get the real file byte
- * offset for write().
- */
- if (!force && !F_ISSET(myslot, WT_MYSLOT_UNBUFFERED))
- memcpy((char *)myslot->slot->slot_buf.mem + myslot->offset,
- record->mem, record->size);
- else
- /*
- * If this is a force or unbuffered write, write it now.
- */
- WT_ERR(__log_fs_write(session, myslot->slot,
- myslot->offset + myslot->slot->slot_start_offset,
- record->size, record->mem));
-
- WT_STAT_CONN_INCRV(session, log_bytes_written, record->size);
- if (lsnp != NULL) {
- *lsnp = myslot->slot->slot_start_lsn;
- lsnp->l.offset += (uint32_t)myslot->offset;
- }
+ WT_DECL_RET;
+
+ /*
+ * Call write or copy into the buffer. For now the offset is the real byte offset. If the offset
+ * becomes a unit of WT_LOG_ALIGN this is where we would multiply by WT_LOG_ALIGN to get the
+ * real file byte offset for write().
+ */
+ if (!force && !F_ISSET(myslot, WT_MYSLOT_UNBUFFERED))
+ memcpy((char *)myslot->slot->slot_buf.mem + myslot->offset, record->mem, record->size);
+ else
+ /*
+ * If this is a force or unbuffered write, write it now.
+ */
+ WT_ERR(__log_fs_write(session, myslot->slot,
+ myslot->offset + myslot->slot->slot_start_offset, record->size, record->mem));
+
+ WT_STAT_CONN_INCRV(session, log_bytes_written, record->size);
+ if (lsnp != NULL) {
+ *lsnp = myslot->slot->slot_start_lsn;
+ lsnp->l.offset += (uint32_t)myslot->offset;
+ }
err:
- if (ret != 0 && myslot->slot->slot_error == 0)
- myslot->slot->slot_error = ret;
- return (ret);
+ if (ret != 0 && myslot->slot->slot_error == 0)
+ myslot->slot->slot_error = ret;
+ return (ret);
}
/*
* __log_file_header --
- * Create and write a log file header into a file handle. If writing
- * into the main log, it will be called locked. If writing into a
- * pre-allocated log, it will be called unlocked.
+ * Create and write a log file header into a file handle. If writing into the main log, it will
+ * be called locked. If writing into a pre-allocated log, it will be called unlocked.
*/
static int
-__log_file_header(
- WT_SESSION_IMPL *session, WT_FH *fh, WT_LSN *end_lsn, bool prealloc)
+__log_file_header(WT_SESSION_IMPL *session, WT_FH *fh, WT_LSN *end_lsn, bool prealloc)
{
- WT_CONNECTION_IMPL *conn;
- WT_DECL_ITEM(buf);
- WT_DECL_RET;
- WT_LOG *log;
- WT_LOGSLOT tmp;
- WT_LOG_DESC *desc;
- WT_LOG_RECORD *logrec;
- WT_MYSLOT myslot;
-
- conn = S2C(session);
- log = conn->log;
-
- /*
- * Set up the log descriptor record. Use a scratch buffer to
- * get correct alignment for direct I/O.
- */
- WT_ASSERT(session, sizeof(WT_LOG_DESC) < log->allocsize);
- WT_RET(__wt_scr_alloc(session, log->allocsize, &buf));
- memset(buf->mem, 0, log->allocsize);
- buf->size = log->allocsize;
-
- logrec = (WT_LOG_RECORD *)buf->mem;
- desc = (WT_LOG_DESC *)logrec->record;
- desc->log_magic = WT_LOG_MAGIC;
- desc->version = log->log_version;
- desc->log_size = (uint64_t)conn->log_file_max;
- __wt_log_desc_byteswap(desc);
-
- /*
- * Now that the record is set up, initialize the record header.
- *
- * Checksum a little-endian version of the header, and write everything
- * in little-endian format. The checksum is (potentially) returned in a
- * big-endian format, swap it into place in a separate step.
- */
- logrec->len = log->allocsize;
- logrec->checksum = 0;
- __wt_log_record_byteswap(logrec);
- logrec->checksum = __wt_checksum(logrec, log->allocsize);
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_LOG *log;
+ WT_LOGSLOT tmp;
+ WT_LOG_DESC *desc;
+ WT_LOG_RECORD *logrec;
+ WT_MYSLOT myslot;
+
+ conn = S2C(session);
+ log = conn->log;
+
+ /*
+ * Set up the log descriptor record. Use a scratch buffer to get correct alignment for direct
+ * I/O.
+ */
+ WT_ASSERT(session, sizeof(WT_LOG_DESC) < log->allocsize);
+ WT_RET(__wt_scr_alloc(session, log->allocsize, &buf));
+ memset(buf->mem, 0, log->allocsize);
+ buf->size = log->allocsize;
+
+ logrec = (WT_LOG_RECORD *)buf->mem;
+ desc = (WT_LOG_DESC *)logrec->record;
+ desc->log_magic = WT_LOG_MAGIC;
+ desc->version = log->log_version;
+ desc->log_size = (uint64_t)conn->log_file_max;
+ __wt_log_desc_byteswap(desc);
+
+ /*
+ * Now that the record is set up, initialize the record header.
+ *
+ * Checksum a little-endian version of the header, and write everything
+ * in little-endian format. The checksum is (potentially) returned in a
+ * big-endian format, swap it into place in a separate step.
+ */
+ logrec->len = log->allocsize;
+ logrec->checksum = 0;
+ __wt_log_record_byteswap(logrec);
+ logrec->checksum = __wt_checksum(logrec, log->allocsize);
#ifdef WORDS_BIGENDIAN
- logrec->checksum = __wt_bswap32(logrec->checksum);
+ logrec->checksum = __wt_bswap32(logrec->checksum);
#endif
- WT_CLEAR(tmp);
- memset(&myslot, 0, sizeof(myslot));
- myslot.slot = &tmp;
-
- /*
- * We may recursively call __wt_log_acquire to allocate log space for
- * the log descriptor record. Call __wt_log_fill to write it, but we
- * do not need to call __wt_log_release because we're not waiting for
- * any earlier operations to complete.
- */
- if (prealloc) {
- WT_ASSERT(session, fh != NULL);
- tmp.slot_fh = fh;
- } else {
- WT_ASSERT(session, fh == NULL);
- WT_ERR(__wt_log_acquire(session, log->allocsize, &tmp));
- }
- WT_ERR(__wt_log_fill(session, &myslot, true, buf, NULL));
- /*
- * Make sure the header gets to disk.
- */
- WT_ERR(__wt_fsync(session, tmp.slot_fh, true));
- if (end_lsn != NULL)
- *end_lsn = tmp.slot_end_lsn;
-
-err: __wt_scr_free(session, &buf);
- return (ret);
+ WT_CLEAR(tmp);
+ memset(&myslot, 0, sizeof(myslot));
+ myslot.slot = &tmp;
+
+ /*
+ * We may recursively call __wt_log_acquire to allocate log space for the log descriptor record.
+ * Call __wt_log_fill to write it, but we do not need to call __wt_log_release because we're not
+ * waiting for any earlier operations to complete.
+ */
+ if (prealloc) {
+ WT_ASSERT(session, fh != NULL);
+ tmp.slot_fh = fh;
+ } else {
+ WT_ASSERT(session, fh == NULL);
+ WT_ERR(__wt_log_acquire(session, log->allocsize, &tmp));
+ }
+ WT_ERR(__wt_log_fill(session, &myslot, true, buf, NULL));
+ /*
+ * Make sure the header gets to disk.
+ */
+ WT_ERR(__wt_fsync(session, tmp.slot_fh, true));
+ if (end_lsn != NULL)
+ *end_lsn = tmp.slot_end_lsn;
+
+err:
+ __wt_scr_free(session, &buf);
+ return (ret);
}
/*
* __log_openfile --
- * Open a log file with the given log file number and return the WT_FH.
+ * Open a log file with the given log file number and return the WT_FH.
*/
static int
-__log_openfile(
- WT_SESSION_IMPL *session, uint32_t id, uint32_t flags, WT_FH **fhp)
+__log_openfile(WT_SESSION_IMPL *session, uint32_t id, uint32_t flags, WT_FH **fhp)
{
- WT_CONNECTION_IMPL *conn;
- WT_DECL_ITEM(buf);
- WT_DECL_RET;
- u_int wtopen_flags;
-
- conn = S2C(session);
- WT_RET(__wt_scr_alloc(session, 0, &buf));
- /*
- * If we are creating the file then we use a temporary file name.
- * Otherwise it is a log file name.
- */
- if (LF_ISSET(WT_LOG_OPEN_CREATE_OK)) {
- wtopen_flags = WT_FS_OPEN_CREATE;
- WT_ERR(__log_filename(session, id, WT_LOG_TMPNAME, buf));
- } else {
- wtopen_flags = 0;
- WT_ERR(__log_filename(session, id, WT_LOG_FILENAME, buf));
- }
- __wt_verbose(session, WT_VERB_LOG,
- "opening log %s", (const char *)buf->data);
- if (FLD_ISSET(conn->direct_io, WT_DIRECT_IO_LOG))
- FLD_SET(wtopen_flags, WT_FS_OPEN_DIRECTIO);
- WT_ERR(__wt_open(
- session, buf->data, WT_FS_OPEN_FILE_TYPE_LOG, wtopen_flags, fhp));
-err: __wt_scr_free(session, &buf);
- return (ret);
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ u_int wtopen_flags;
+
+ conn = S2C(session);
+ WT_RET(__wt_scr_alloc(session, 0, &buf));
+ /*
+ * If we are creating the file then we use a temporary file name. Otherwise it is a log file
+ * name.
+ */
+ if (LF_ISSET(WT_LOG_OPEN_CREATE_OK)) {
+ wtopen_flags = WT_FS_OPEN_CREATE;
+ WT_ERR(__log_filename(session, id, WT_LOG_TMPNAME, buf));
+ } else {
+ wtopen_flags = 0;
+ WT_ERR(__log_filename(session, id, WT_LOG_FILENAME, buf));
+ }
+ __wt_verbose(session, WT_VERB_LOG, "opening log %s", (const char *)buf->data);
+ if (FLD_ISSET(conn->direct_io, WT_DIRECT_IO_LOG))
+ FLD_SET(wtopen_flags, WT_FS_OPEN_DIRECTIO);
+ WT_ERR(__wt_open(session, buf->data, WT_FS_OPEN_FILE_TYPE_LOG, wtopen_flags, fhp));
+err:
+ __wt_scr_free(session, &buf);
+ return (ret);
}
/*
* __log_open_verify --
- * Open a log file with the given log file number, verify its
- * header and return various pieces of system information about
- * this log file.
+ * Open a log file with the given log file number, verify its header and return various pieces
+ * of system information about this log file.
*/
static int
-__log_open_verify(WT_SESSION_IMPL *session, uint32_t id, WT_FH **fhp,
- WT_LSN *lsnp, uint16_t *versionp, bool *need_salvagep)
+__log_open_verify(WT_SESSION_IMPL *session, uint32_t id, WT_FH **fhp, WT_LSN *lsnp,
+ uint16_t *versionp, bool *need_salvagep)
{
- WT_CONNECTION_IMPL *conn;
- WT_DECL_ITEM(buf);
- WT_DECL_RET;
- WT_FH *fh;
- WT_LOG *log;
- WT_LOG_DESC *desc;
- WT_LOG_RECORD *logrec;
- uint32_t allocsize, rectype;
- const uint8_t *end, *p;
- bool need_salvage, salvage_mode;
-
- conn = S2C(session);
- fh = NULL;
- log = conn->log;
- need_salvage = false;
- WT_RET(__wt_scr_alloc(session, 0, &buf));
- salvage_mode = (need_salvagep != NULL &&
- F_ISSET(conn, WT_CONN_SALVAGE));
-
- if (log == NULL)
- allocsize = WT_LOG_ALIGN;
- else
- allocsize = log->allocsize;
- if (lsnp != NULL)
- WT_ZERO_LSN(lsnp);
- WT_ERR(__wt_buf_grow(session, buf, allocsize));
- memset(buf->mem, 0, allocsize);
-
- /*
- * Any operation that fails from here on out indicates corruption
- * that could be salvaged.
- */
- need_salvage = true;
-
- /*
- * Read in the log file header and verify it.
- */
- WT_ERR(__log_openfile(session, id, 0, &fh));
- WT_ERR(__log_fs_read(session, fh, 0, allocsize, buf->mem));
- logrec = (WT_LOG_RECORD *)buf->mem;
- __wt_log_record_byteswap(logrec);
- desc = (WT_LOG_DESC *)logrec->record;
- __wt_log_desc_byteswap(desc);
- if (desc->log_magic != WT_LOG_MAGIC) {
- if (salvage_mode)
- WT_ERR_MSG(session, WT_ERROR,
- "log file %s corrupted: Bad magic number %" PRIu32,
- fh->name, desc->log_magic);
- else
- WT_PANIC_RET(session, WT_ERROR,
- "log file %s corrupted: Bad magic number %" PRIu32,
- fh->name, desc->log_magic);
- }
- /*
- * We cannot read future log file formats.
- */
- if (desc->version > WT_LOG_VERSION)
- WT_ERR_MSG(session, WT_ERROR,
- "unsupported WiredTiger file version: this build"
- " only supports versions up to %d,"
- " and the file is version %" PRIu16,
- WT_LOG_VERSION, desc->version);
-
- /*
- * We error if the log version is less than the required minimum or
- * larger than the required maximum.
- */
- if (conn->req_max_major != WT_CONN_COMPAT_NONE &&
- desc->version > conn->log_req_max)
- WT_ERR_MSG(session, WT_ERROR,
- WT_COMPAT_MSG_PREFIX
- "unsupported WiredTiger file version: this build"
- " requires a maximum version of %" PRIu16 ","
- " and the file is version %" PRIu16,
- conn->log_req_max, desc->version);
-
- if (conn->req_min_major != WT_CONN_COMPAT_NONE &&
- desc->version < conn->log_req_min)
- WT_ERR_MSG(session, WT_ERROR,
- WT_COMPAT_MSG_PREFIX
- "unsupported WiredTiger file version: this build"
- " requires a minimum version of %" PRIu16 ","
- " and the file is version %" PRIu16,
- conn->log_req_min, desc->version);
-
- /*
- * Set up the return values since the header is valid.
- */
- if (versionp != NULL)
- *versionp = desc->version;
-
- /*
- * Skip reading in the previous LSN if log file is an old version
- * or if the caller doesn't care about the LSN. Otherwise read that
- * record in and set up the LSN. We already have a buffer that is
- * the correct size. Reuse it.
- */
- if (lsnp == NULL ||
- (desc->version < WT_LOG_VERSION_SYSTEM))
- goto err;
-
- memset(buf->mem, 0, allocsize);
- WT_ERR(__log_fs_read(session, fh, allocsize, allocsize, buf->mem));
- logrec = (WT_LOG_RECORD *)buf->mem;
- /*
- * We have a valid header but the system record is not there.
- * The log ends here. Return without setting the LSN.
- */
- if (logrec->len == 0) {
- __wt_verbose(session, WT_VERB_LOG,
- "Log %s found empty log after header", fh->name);
- goto err;
- }
-
- if (!__log_checksum_match(buf, allocsize))
- WT_ERR_MSG(session, WT_ERROR,
- "%s: System log record checksum mismatch", fh->name);
- __wt_log_record_byteswap(logrec);
- p = WT_LOG_SKIP_HEADER(buf->data);
- end = (const uint8_t *)buf->data + allocsize;
- WT_ERR(__wt_logrec_read(session, &p, end, &rectype));
- if (rectype != WT_LOGREC_SYSTEM)
- WT_ERR_MSG(session, WT_ERROR, "System log record missing");
- WT_ERR(__wt_log_recover_system(session, &p, end, lsnp));
-
-err: __wt_scr_free(session, &buf);
-
- /*
- * Return the file handle if needed, otherwise close it.
- */
- if (fhp != NULL && ret == 0)
- *fhp = fh;
- else if (ret != 0 && need_salvage && salvage_mode) {
- /* Let the caller know this file must be salvaged. */
- ret = 0;
- WT_TRET(__wt_close(session, &fh));
- if (fhp != NULL)
- *fhp = NULL;
- *need_salvagep = true;
- } else
- WT_TRET(__wt_close(session, &fh));
-
- return (ret);
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_FH *fh;
+ WT_LOG *log;
+ WT_LOG_DESC *desc;
+ WT_LOG_RECORD *logrec;
+ uint32_t allocsize, rectype;
+ const uint8_t *end, *p;
+ bool need_salvage, salvage_mode;
+
+ conn = S2C(session);
+ fh = NULL;
+ log = conn->log;
+ need_salvage = false;
+ WT_RET(__wt_scr_alloc(session, 0, &buf));
+ salvage_mode = (need_salvagep != NULL && F_ISSET(conn, WT_CONN_SALVAGE));
+
+ if (log == NULL)
+ allocsize = WT_LOG_ALIGN;
+ else
+ allocsize = log->allocsize;
+ if (lsnp != NULL)
+ WT_ZERO_LSN(lsnp);
+ WT_ERR(__wt_buf_grow(session, buf, allocsize));
+ memset(buf->mem, 0, allocsize);
+
+ /*
+ * Any operation that fails from here on out indicates corruption that could be salvaged.
+ */
+ need_salvage = true;
+
+ /*
+ * Read in the log file header and verify it.
+ */
+ WT_ERR(__log_openfile(session, id, 0, &fh));
+ WT_ERR(__log_fs_read(session, fh, 0, allocsize, buf->mem));
+ logrec = (WT_LOG_RECORD *)buf->mem;
+ __wt_log_record_byteswap(logrec);
+ desc = (WT_LOG_DESC *)logrec->record;
+ __wt_log_desc_byteswap(desc);
+ if (desc->log_magic != WT_LOG_MAGIC) {
+ if (salvage_mode)
+ WT_ERR_MSG(session, WT_ERROR, "log file %s corrupted: Bad magic number %" PRIu32,
+ fh->name, desc->log_magic);
+ else
+ WT_PANIC_RET(session, WT_ERROR, "log file %s corrupted: Bad magic number %" PRIu32,
+ fh->name, desc->log_magic);
+ }
+ /*
+ * We cannot read future log file formats.
+ */
+ if (desc->version > WT_LOG_VERSION)
+ WT_ERR_MSG(session, WT_ERROR,
+ "unsupported WiredTiger file version: this build"
+ " only supports versions up to %d,"
+ " and the file is version %" PRIu16,
+ WT_LOG_VERSION, desc->version);
+
+ /*
+ * We error if the log version is less than the required minimum or larger than the required
+ * maximum.
+ */
+ if (conn->req_max_major != WT_CONN_COMPAT_NONE && desc->version > conn->log_req_max)
+ WT_ERR_MSG(session, WT_ERROR, WT_COMPAT_MSG_PREFIX
+ "unsupported WiredTiger file version: this build"
+ " requires a maximum version of %" PRIu16
+ ","
+ " and the file is version %" PRIu16,
+ conn->log_req_max, desc->version);
+
+ if (conn->req_min_major != WT_CONN_COMPAT_NONE && desc->version < conn->log_req_min)
+ WT_ERR_MSG(session, WT_ERROR, WT_COMPAT_MSG_PREFIX
+ "unsupported WiredTiger file version: this build"
+ " requires a minimum version of %" PRIu16
+ ","
+ " and the file is version %" PRIu16,
+ conn->log_req_min, desc->version);
+
+ /*
+ * Set up the return values since the header is valid.
+ */
+ if (versionp != NULL)
+ *versionp = desc->version;
+
+ /*
+ * Skip reading in the previous LSN if log file is an old version or if the caller doesn't care
+ * about the LSN. Otherwise read that record in and set up the LSN. We already have a buffer
+ * that is the correct size. Reuse it.
+ */
+ if (lsnp == NULL || (desc->version < WT_LOG_VERSION_SYSTEM))
+ goto err;
+
+ memset(buf->mem, 0, allocsize);
+ WT_ERR(__log_fs_read(session, fh, allocsize, allocsize, buf->mem));
+ logrec = (WT_LOG_RECORD *)buf->mem;
+ /*
+ * We have a valid header but the system record is not there. The log ends here. Return without
+ * setting the LSN.
+ */
+ if (logrec->len == 0) {
+ __wt_verbose(session, WT_VERB_LOG, "Log %s found empty log after header", fh->name);
+ goto err;
+ }
+
+ if (!__log_checksum_match(buf, allocsize))
+ WT_ERR_MSG(session, WT_ERROR, "%s: System log record checksum mismatch", fh->name);
+ __wt_log_record_byteswap(logrec);
+ p = WT_LOG_SKIP_HEADER(buf->data);
+ end = (const uint8_t *)buf->data + allocsize;
+ WT_ERR(__wt_logrec_read(session, &p, end, &rectype));
+ if (rectype != WT_LOGREC_SYSTEM)
+ WT_ERR_MSG(session, WT_ERROR, "System log record missing");
+ WT_ERR(__wt_log_recover_system(session, &p, end, lsnp));
+
+err:
+ __wt_scr_free(session, &buf);
+
+ /*
+ * Return the file handle if needed, otherwise close it.
+ */
+ if (fhp != NULL && ret == 0)
+ *fhp = fh;
+ else if (ret != 0 && need_salvage && salvage_mode) {
+ /* Let the caller know this file must be salvaged. */
+ ret = 0;
+ WT_TRET(__wt_close(session, &fh));
+ if (fhp != NULL)
+ *fhp = NULL;
+ *need_salvagep = true;
+ } else
+ WT_TRET(__wt_close(session, &fh));
+
+ return (ret);
}
/*
* __log_record_verify --
- * Check that values of the log record header are valid.
- * No byteswap of the header has been done at this point.
+ * Check that values of the log record header are valid. No byteswap of the header has been done
+ * at this point.
*/
static int
-__log_record_verify(WT_SESSION_IMPL *session, WT_FH *log_fh, uint32_t offset,
- WT_LOG_RECORD *logrecp, bool *corrupt)
+__log_record_verify(
+ WT_SESSION_IMPL *session, WT_FH *log_fh, uint32_t offset, WT_LOG_RECORD *logrecp, bool *corrupt)
{
- WT_LOG_RECORD logrec;
- size_t i;
-
- *corrupt = false;
-
- /*
- * Make our own copy of the header so we can get the bytes in the
- * proper order.
- */
- logrec = *logrecp;
- __wt_log_record_byteswap(&logrec);
-
- if (F_ISSET(&logrec, ~(WT_LOG_RECORD_ALL_FLAGS))) {
- WT_RET(__wt_msg(session,
- "%s: log record at position %" PRIu32
- " has flag corruption 0x%" PRIx16, log_fh->name, offset,
- logrec.flags));
- *corrupt = true;
- }
- for (i = 0; i < sizeof(logrec.unused); i++)
- if (logrec.unused[i] != 0) {
- WT_RET(__wt_msg(session,
- "%s: log record at position %" PRIu32
- " has unused[%" WT_SIZET_FMT "] corruption 0x%"
- PRIx8, log_fh->name, offset, i, logrec.unused[i]));
- *corrupt = true;
- }
- if (logrec.mem_len != 0 && !F_ISSET(&logrec,
- WT_LOG_RECORD_COMPRESSED | WT_LOG_RECORD_ENCRYPTED)) {
- WT_RET(__wt_msg(session,
- "%s: log record at position %" PRIu32
- " has memory len corruption 0x%" PRIx32, log_fh->name,
- offset, logrec.mem_len));
- *corrupt = true;
- }
- if (logrec.len <= offsetof(WT_LOG_RECORD, record)) {
- WT_RET(__wt_msg(session,
- "%s: log record at position %" PRIu32
- " has record len corruption 0x%" PRIx32, log_fh->name,
- offset, logrec.len));
- *corrupt = true;
- }
- return (0);
+ WT_LOG_RECORD logrec;
+ size_t i;
+
+ *corrupt = false;
+
+ /*
+ * Make our own copy of the header so we can get the bytes in the proper order.
+ */
+ logrec = *logrecp;
+ __wt_log_record_byteswap(&logrec);
+
+ if (F_ISSET(&logrec, ~(WT_LOG_RECORD_ALL_FLAGS))) {
+ WT_RET(
+ __wt_msg(session, "%s: log record at position %" PRIu32 " has flag corruption 0x%" PRIx16,
+ log_fh->name, offset, logrec.flags));
+ *corrupt = true;
+ }
+ for (i = 0; i < sizeof(logrec.unused); i++)
+ if (logrec.unused[i] != 0) {
+ WT_RET(__wt_msg(session, "%s: log record at position %" PRIu32
+ " has unused[%" WT_SIZET_FMT "] corruption 0x%" PRIx8,
+ log_fh->name, offset, i, logrec.unused[i]));
+ *corrupt = true;
+ }
+ if (logrec.mem_len != 0 &&
+ !F_ISSET(&logrec, WT_LOG_RECORD_COMPRESSED | WT_LOG_RECORD_ENCRYPTED)) {
+ WT_RET(__wt_msg(session,
+ "%s: log record at position %" PRIu32 " has memory len corruption 0x%" PRIx32,
+ log_fh->name, offset, logrec.mem_len));
+ *corrupt = true;
+ }
+ if (logrec.len <= offsetof(WT_LOG_RECORD, record)) {
+ WT_RET(__wt_msg(session,
+ "%s: log record at position %" PRIu32 " has record len corruption 0x%" PRIx32,
+ log_fh->name, offset, logrec.len));
+ *corrupt = true;
+ }
+ return (0);
}
/*
* __log_alloc_prealloc --
- * Look for a pre-allocated log file and rename it to use as the next
- * real log file. Called locked.
+ * Look for a pre-allocated log file and rename it to use as the next real log file. Called
+ * locked.
+ *
+ * Returns WT_NOTFOUND when no WT_LOG_PREPNAME files are listed. Otherwise the first listed
+ * file is renamed to the WT_LOG_FILENAME name built from to_num, and the result of that rename
+ * (combined with the result of freeing the directory listing) is returned.
*/
static int
__log_alloc_prealloc(WT_SESSION_IMPL *session, uint32_t to_num)
{
- WT_CONNECTION_IMPL *conn;
- WT_DECL_ITEM(from_path);
- WT_DECL_ITEM(to_path);
- WT_DECL_RET;
- WT_LOG *log;
- uint32_t from_num;
- u_int logcount;
- char **logfiles;
- bool locked;
-
- conn = S2C(session);
- log = conn->log;
- logfiles = NULL;
- locked = false;
-
- /*
- * If there are no pre-allocated files, return WT_NOTFOUND.
- */
- WT_RET(__log_get_files_single(
- session, WT_LOG_PREPNAME, &logfiles, &logcount));
- if (logcount == 0)
- return (WT_NOTFOUND);
-
- /* We have a file to use. */
- WT_ERR(__wt_log_extract_lognum(session, logfiles[0], &from_num));
-
- WT_ERR(__wt_scr_alloc(session, 0, &from_path));
- WT_ERR(__wt_scr_alloc(session, 0, &to_path));
- WT_ERR(__log_filename(session, from_num, WT_LOG_PREPNAME, from_path));
- WT_ERR(__log_filename(session, to_num, WT_LOG_FILENAME, to_path));
- __wt_spin_lock(session, &log->log_fs_lock);
- locked = true;
- __wt_verbose(session, WT_VERB_LOG,
- "log_alloc_prealloc: rename log %s to %s",
- (const char *)from_path->data, (const char *)to_path->data);
- WT_STAT_CONN_INCR(session, log_prealloc_used);
- /*
- * All file setup, writing the header and pre-allocation was done
- * before. We only need to rename it.
- */
- WT_ERR(__wt_fs_rename(session, from_path->data, to_path->data, false));
-
-err: __wt_scr_free(session, &from_path);
- __wt_scr_free(session, &to_path);
- if (locked)
- __wt_spin_unlock(session, &log->log_fs_lock);
- WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount));
- return (ret);
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_ITEM(from_path);
+ WT_DECL_ITEM(to_path);
+ WT_DECL_RET;
+ WT_LOG *log;
+ uint32_t from_num;
+ u_int logcount;
+ char **logfiles;
+ bool locked;
+
+ conn = S2C(session);
+ log = conn->log;
+ logfiles = NULL;
+ locked = false; /* Set once log_fs_lock is taken so the err path only unlocks a held lock. */
+
+ /*
+ * If there are no pre-allocated files, return WT_NOTFOUND.
+ *
+ * NOTE(review): this early return does not go through the err label, so the directory list is
+ * not freed here; presumably nothing is allocated when the count is zero -- confirm against
+ * __log_get_files_single.
+ */
+ WT_RET(__log_get_files_single(session, WT_LOG_PREPNAME, &logfiles, &logcount));
+ if (logcount == 0)
+ return (WT_NOTFOUND);
+
+ /* We have a file to use: take the first entry in the listing. */
+ WT_ERR(__wt_log_extract_lognum(session, logfiles[0], &from_num));
+
+ WT_ERR(__wt_scr_alloc(session, 0, &from_path));
+ WT_ERR(__wt_scr_alloc(session, 0, &to_path));
+ WT_ERR(__log_filename(session, from_num, WT_LOG_PREPNAME, from_path));
+ WT_ERR(__log_filename(session, to_num, WT_LOG_FILENAME, to_path));
+ __wt_spin_lock(session, &log->log_fs_lock);
+ locked = true;
+ __wt_verbose(session, WT_VERB_LOG, "log_alloc_prealloc: rename log %s to %s",
+ (const char *)from_path->data, (const char *)to_path->data);
+ WT_STAT_CONN_INCR(session, log_prealloc_used); /* Counted before the rename is attempted. */
+ /*
+ * All file setup, writing the header and pre-allocation was done before. We only need to rename
+ * it.
+ */
+ WT_ERR(__wt_fs_rename(session, from_path->data, to_path->data, false));
+
+err:
+ __wt_scr_free(session, &from_path);
+ __wt_scr_free(session, &to_path);
+ if (locked)
+ __wt_spin_unlock(session, &log->log_fs_lock);
+ WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount));
+ return (ret);
}
/*
* __log_newfile --
- * Create the next log file and write the file header record into it.
+ * Create the next log file and write the file header record into it.
+ *
+ * conn_open: when true, the new file is fsync'd and sync_lsn, write_lsn and write_start_lsn are
+ * set to the end of the header (or of the system record, when one is written).
+ *
+ * created: optional output; set to true if the file was created here and false if a
+ * pre-allocated file was renamed into place.
+ *
+ * Returns EBUSY if the previous log file handle is still waiting to be closed when the yield
+ * loop gives up; otherwise 0 or the first error from the called functions.
*/
static int
__log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created)
{
- WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
- WT_FH *log_fh;
- WT_LOG *log;
- WT_LSN end_lsn, logrec_lsn;
- u_int yield_cnt;
- bool create_log, skipp;
-
- conn = S2C(session);
- log = conn->log;
-
- /*
- * Set aside the log file handle to be closed later. Other threads
- * may still be using it to write to the log. If the log file size
- * is small we could fill a log file before the previous one is closed.
- * Wait for that to close.
- */
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
- for (yield_cnt = 0; log->log_close_fh != NULL;) {
- WT_STAT_CONN_INCR(session, log_close_yields);
- /*
- * Processing slots will conditionally signal the file close
- * server thread. But if we've tried a while, signal the
- * thread directly here.
- */
- __wt_log_wrlsn(session, NULL);
- if (++yield_cnt % WT_THOUSAND == 0) {
- __wt_spin_unlock(session, &log->log_slot_lock);
- __wt_cond_signal(session, conn->log_file_cond);
- __wt_spin_lock(session, &log->log_slot_lock);
- }
- if (++yield_cnt > WT_THOUSAND * 10)
- return (__wt_set_return(session, EBUSY));
- __wt_yield();
- }
- /*
- * Note, the file server worker thread requires the LSN be set once the
- * close file handle is set, force that ordering.
- */
- if (log->log_fh == NULL)
- log->log_close_fh = NULL;
- else {
- log->log_close_lsn = log->alloc_lsn;
- WT_PUBLISH(log->log_close_fh, log->log_fh);
- }
- log->fileid++;
-
- /*
- * If pre-allocating log files look for one; otherwise, or if we don't
- * find one, create a log file. We can't use pre-allocated log files
- * while a hot backup is in progress: applications can copy the files
- * in any way they choose, and a log file rename might confuse things.
- */
- create_log = true;
- if (conn->log_prealloc > 0 && !conn->hot_backup) {
- WT_WITH_HOTBACKUP_READ_LOCK(session,
- ret = __log_alloc_prealloc(session, log->fileid),
- &skipp);
-
- if (!skipp) {
- /*
- * If ret is 0 it means we found a pre-allocated file.
- * If ret is WT_NOTFOUND, create the new log file and
- * signal the server, we missed our pre-allocation.
- * If ret is non-zero but not WT_NOTFOUND, return the
- * error.
- */
- WT_RET_NOTFOUND_OK(ret);
- if (ret == 0)
- create_log = false;
- else {
- WT_STAT_CONN_INCR(session, log_prealloc_missed);
- if (conn->log_cond != NULL)
- __wt_cond_signal(
- session, conn->log_cond);
- }
- }
- }
- /*
- * If we need to create the log file, do so now.
- */
- if (create_log) {
- /*
- * Increment the missed pre-allocated file counter only
- * if a hot backup is not in progress. We are deliberately
- * not using pre-allocated log files during backup
- * (see comment above).
- */
- if (!conn->hot_backup)
- log->prep_missed++;
- WT_RET(__wt_log_allocfile(
- session, log->fileid, WT_LOG_FILENAME));
- }
- /*
- * Since the file system clears the output file handle pointer before
- * searching the handle list and filling in the new file handle,
- * we must pass in a local file handle. Otherwise there is a wide
- * window where another thread could see a NULL log file handle.
- */
- WT_RET(__log_open_verify(session, log->fileid, &log_fh, NULL, NULL,
- NULL));
- /*
- * Write the LSN at the end of the last record in the previous log file
- * as the first record in this log file.
- */
- if (log->fileid == 1)
- WT_INIT_LSN(&logrec_lsn);
- else
- logrec_lsn = log->alloc_lsn;
- /*
- * We need to setup the LSNs. Set the end LSN and alloc LSN to
- * the end of the header.
- */
- WT_SET_LSN(&log->alloc_lsn, log->fileid, WT_LOG_END_HEADER);
- /*
- * If we're running the version where we write a system record
- * do so now and update the alloc_lsn.
- */
- if (log->log_version >= WT_LOG_VERSION_SYSTEM) {
- WT_RET(__wt_log_system_record(session,
- log_fh, &logrec_lsn));
- WT_SET_LSN(&log->alloc_lsn, log->fileid, log->first_record);
- }
- end_lsn = log->alloc_lsn;
- WT_PUBLISH(log->log_fh, log_fh);
-
- /*
- * If we're called from connection creation code, we need to update
- * the LSNs since we're the only write in progress.
- */
- if (conn_open) {
- WT_RET(__wt_fsync(session, log->log_fh, true));
- log->sync_lsn = end_lsn;
- log->write_lsn = end_lsn;
- log->write_start_lsn = end_lsn;
- }
- log->dirty_lsn = log->alloc_lsn;
- if (created != NULL)
- *created = create_log;
- return (0);
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_FH *log_fh;
+ WT_LOG *log;
+ WT_LSN end_lsn, logrec_lsn;
+ u_int yield_cnt;
+ bool create_log, skipp;
+
+ conn = S2C(session);
+ log = conn->log;
+
+ /*
+ * Set aside the log file handle to be closed later. Other threads may still be using it to
+ * write to the log. If the log file size is small we could fill a log file before the previous
+ * one is closed. Wait for that to close.
+ */
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
+ for (yield_cnt = 0; log->log_close_fh != NULL;) {
+ WT_STAT_CONN_INCR(session, log_close_yields);
+ /*
+ * Processing slots will conditionally signal the file close server thread. But if we've
+ * tried a while, signal the thread directly here.
+ */
+ __wt_log_wrlsn(session, NULL);
+ if (++yield_cnt % WT_THOUSAND == 0) {
+ /* The slot lock is dropped around the signal and re-taken before continuing. */
+ __wt_spin_unlock(session, &log->log_slot_lock);
+ __wt_cond_signal(session, conn->log_file_cond);
+ __wt_spin_lock(session, &log->log_slot_lock);
+ }
+ /*
+ * The counter is advanced exactly once per pass (above). Incrementing it a second time here
+ * would leave the modulo test above seeing only odd values, so the direct signal would never
+ * be sent, and the limit below would be reached after half as many passes.
+ */
+ if (yield_cnt > WT_THOUSAND * 10)
+ return (__wt_set_return(session, EBUSY));
+ __wt_yield();
+ }
+ /*
+ * Note, the file server worker thread requires the LSN be set once the close file handle is
+ * set, force that ordering.
+ */
+ if (log->log_fh == NULL)
+ log->log_close_fh = NULL;
+ else {
+ log->log_close_lsn = log->alloc_lsn;
+ WT_PUBLISH(log->log_close_fh, log->log_fh);
+ }
+ log->fileid++;
+
+ /*
+ * If pre-allocating log files look for one; otherwise, or if we don't find one, create a log
+ * file. We can't use pre-allocated log files while a hot backup is in progress: applications
+ * can copy the files in any way they choose, and a log file rename might confuse things.
+ */
+ create_log = true;
+ if (conn->log_prealloc > 0 && !conn->hot_backup) {
+ WT_WITH_HOTBACKUP_READ_LOCK(
+ session, ret = __log_alloc_prealloc(session, log->fileid), &skipp);
+
+ if (!skipp) {
+ /*
+ * If ret is 0 it means we found a pre-allocated file. If ret is WT_NOTFOUND, create the
+ * new log file and signal the server, we missed our pre-allocation. If ret is non-zero
+ * but not WT_NOTFOUND, return the error.
+ */
+ WT_RET_NOTFOUND_OK(ret);
+ if (ret == 0)
+ create_log = false;
+ else {
+ WT_STAT_CONN_INCR(session, log_prealloc_missed);
+ if (conn->log_cond != NULL)
+ __wt_cond_signal(session, conn->log_cond);
+ }
+ }
+ }
+ /*
+ * If we need to create the log file, do so now.
+ */
+ if (create_log) {
+ /*
+ * Increment the missed pre-allocated file counter only
+ * if a hot backup is not in progress. We are deliberately
+ * not using pre-allocated log files during backup
+ * (see comment above).
+ */
+ if (!conn->hot_backup)
+ log->prep_missed++;
+ WT_RET(__wt_log_allocfile(session, log->fileid, WT_LOG_FILENAME));
+ }
+ /*
+ * Since the file system clears the output file handle pointer before searching the handle list
+ * and filling in the new file handle, we must pass in a local file handle. Otherwise there is a
+ * wide window where another thread could see a NULL log file handle.
+ */
+ WT_RET(__log_open_verify(session, log->fileid, &log_fh, NULL, NULL, NULL));
+ /*
+ * Write the LSN at the end of the last record in the previous log file as the first record in
+ * this log file.
+ */
+ if (log->fileid == 1)
+ WT_INIT_LSN(&logrec_lsn);
+ else
+ logrec_lsn = log->alloc_lsn;
+ /*
+ * We need to set up the LSNs. Set the end LSN and alloc LSN to the end of the header.
+ */
+ WT_SET_LSN(&log->alloc_lsn, log->fileid, WT_LOG_END_HEADER);
+ /*
+ * If we're running the version where we write a system record do so now and update the
+ * alloc_lsn.
+ */
+ if (log->log_version >= WT_LOG_VERSION_SYSTEM) {
+ WT_RET(__wt_log_system_record(session, log_fh, &logrec_lsn));
+ WT_SET_LSN(&log->alloc_lsn, log->fileid, log->first_record);
+ }
+ end_lsn = log->alloc_lsn;
+ /* Only now is the fully set-up local handle made visible as the connection's log handle. */
+ WT_PUBLISH(log->log_fh, log_fh);
+
+ /*
+ * If we're called from connection creation code, we need to update the LSNs since we're the
+ * only write in progress.
+ */
+ if (conn_open) {
+ WT_RET(__wt_fsync(session, log->log_fh, true));
+ log->sync_lsn = end_lsn;
+ log->write_lsn = end_lsn;
+ log->write_start_lsn = end_lsn;
+ }
+ log->dirty_lsn = log->alloc_lsn;
+ if (created != NULL)
+ *created = create_log;
+ return (0);
}
/*
* __log_set_version --
- * Set version related information under lock.
+ * Set version related information under lock.
+ *
+ * Stores version and first_rec in the log structure, sets or clears WT_CONN_LOG_DOWNGRADED
+ * according to downgrade, and sets WT_LOG_FORCE_NEWFILE when live_chg is true. Unless the
+ * connection is read-only, the return value is that of __log_prealloc_remove(); otherwise 0.
*/
static int
-__log_set_version(WT_SESSION_IMPL *session, uint16_t version,
- uint32_t first_rec, bool live_chg, bool downgrade)
+__log_set_version(
+ WT_SESSION_IMPL *session, uint16_t version, uint32_t first_rec, bool live_chg, bool downgrade)
{
- WT_CONNECTION_IMPL *conn;
- WT_LOG *log;
-
- conn = S2C(session);
- log = conn->log;
-
- log->log_version = version;
- log->first_record = first_rec;
- if (downgrade)
- FLD_SET(conn->log_flags, WT_CONN_LOG_DOWNGRADED);
- else
- FLD_CLR(conn->log_flags, WT_CONN_LOG_DOWNGRADED);
- if (live_chg)
- F_SET(log, WT_LOG_FORCE_NEWFILE);
- if (!F_ISSET(conn, WT_CONN_READONLY))
- return (__log_prealloc_remove(session));
-
- return (0);
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+
+ conn = S2C(session);
+ log = conn->log;
+
+ log->log_version = version;
+ log->first_record = first_rec;
+ if (downgrade)
+ FLD_SET(conn->log_flags, WT_CONN_LOG_DOWNGRADED);
+ else
+ FLD_CLR(conn->log_flags, WT_CONN_LOG_DOWNGRADED);
+ if (live_chg)
+ F_SET(log, WT_LOG_FORCE_NEWFILE); /* Checked and cleared in __wt_log_acquire. */
+ if (!F_ISSET(conn, WT_CONN_READONLY))
+ return (__log_prealloc_remove(session));
+
+ return (0);
}
/*
* __wt_log_set_version --
- * Change the version number in logging. Will be done with locking.
- * We need to force the log file to advance and remove all old
- * pre-allocated files.
+ * Change the version number in logging. Will be done with locking. We need to force the log
+ * file to advance and remove all old pre-allocated files.
+ *
+ * When live_chg is false only the version information is updated and the result of that update
+ * is returned. lognump is optional; it is written (with the file number of the allocation LSN)
+ * only when live_chg is true and every step succeeded.
*/
int
-__wt_log_set_version(WT_SESSION_IMPL *session, uint16_t version,
- uint32_t first_rec, bool downgrade, bool live_chg, uint32_t *lognump)
+__wt_log_set_version(WT_SESSION_IMPL *session, uint16_t version, uint32_t first_rec, bool downgrade,
+ bool live_chg, uint32_t *lognump)
{
- WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
- WT_LOG *log;
-
- conn = S2C(session);
- log = conn->log;
-
- /*
- * The steps are:
- * - Set up versions and remove files under lock.
- * - Set a flag so that the next slot change forces a file change.
- * - Force out the slot that is currently active in the current log.
- * - Write a log record to force a record into the new log file.
- */
- WT_WITH_SLOT_LOCK(session, log,
- ret = __log_set_version(session,
- version, first_rec, live_chg, downgrade));
- if (!live_chg)
- return (ret);
- WT_ERR(ret);
- /*
- * A new log file will be used when we force out the earlier slot.
- */
- WT_ERR(__wt_log_force_write(session, 1, NULL));
-
- /*
- * We need to write a record to the new version log file so that
- * a potential checkpoint finds LSNs in that new log file and
- * an archive correctly removes all earlier logs.
- * Write an internal printf record.
- */
- WT_ERR(__wt_log_printf(session,
- "COMPATIBILITY: Version now %" PRIu16, log->log_version));
- if (lognump != NULL)
- *lognump = log->alloc_lsn.l.file;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+
+ conn = S2C(session);
+ log = conn->log;
+
+ /*
+ * The steps are:
+ * - Set up versions and remove files under lock.
+ * - Set a flag so that the next slot change forces a file change.
+ * - Force out the slot that is currently active in the current log.
+ * - Write a log record to force a record into the new log file.
+ *
+ * Note the helper takes live_chg before downgrade, the reverse of this function's own parameter
+ * order, so the arguments are deliberately passed in swapped positions.
+ */
+ WT_WITH_SLOT_LOCK(
+ session, log, ret = __log_set_version(session, version, first_rec, live_chg, downgrade));
+ if (!live_chg)
+ return (ret);
+ WT_ERR(ret);
+ /*
+ * A new log file will be used when we force out the earlier slot.
+ */
+ WT_ERR(__wt_log_force_write(session, 1, NULL));
+
+ /*
+ * We need to write a record to the new version log file so that a potential checkpoint finds
+ * LSNs in that new log file and an archive correctly removes all earlier logs. Write an
+ * internal printf record.
+ */
+ WT_ERR(__wt_log_printf(session, "COMPATIBILITY: Version now %" PRIu16, log->log_version));
+ if (lognump != NULL)
+ *lognump = log->alloc_lsn.l.file;
err:
- return (ret);
+ return (ret);
}
/*
* __wt_log_acquire --
- * Called serially when switching slots. Can be called recursively
- * from __log_newfile when we change log files.
+ * Called serially when switching slots. Can be called recursively from __log_newfile when we
+ * change log files.
+ *
+ * Records the current allocation LSN as the slot's release LSN, switches to a new log file if
+ * one is forced or recsize does not fit, pre-allocates a freshly created file on its first
+ * write, and activates the slot. Returns 0 or the first error from those steps.
*/
int
__wt_log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot)
{
- WT_CONNECTION_IMPL *conn;
- WT_LOG *log;
- bool created_log;
-
- conn = S2C(session);
- log = conn->log;
- created_log = true;
- /*
- * Add recsize to alloc_lsn. Save our starting LSN
- * where the previous allocation finished for the release LSN.
- * That way when log files switch, we're waiting for the correct LSN
- * from outstanding writes.
- */
- WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
- /*
- * We need to set the release LSN earlier, before a log file change.
- */
- slot->slot_release_lsn = log->alloc_lsn;
- /*
- * Make sure that the size can fit in the file. Proactively switch
- * if it cannot. This reduces, but does not eliminate, log files
- * that exceed the maximum file size. We want to minimize the risk
- * of an error due to no space.
- */
- if (F_ISSET(log, WT_LOG_FORCE_NEWFILE) ||
- !__log_size_fit(session, &log->alloc_lsn, recsize)) {
- WT_RET(__log_newfile(session, false, &created_log));
- F_CLR(log, WT_LOG_FORCE_NEWFILE);
- if (log->log_close_fh != NULL)
- F_SET(slot, WT_SLOT_CLOSEFH);
- }
-
- /*
- * Pre-allocate on the first real write into the log file, if it
- * was just created (i.e. not pre-allocated).
- */
- if (log->alloc_lsn.l.offset == log->first_record && created_log)
- WT_RET(__log_prealloc(session, log->log_fh));
- /*
- * Initialize the slot for activation.
- */
- __wt_log_slot_activate(session, slot);
-
- return (0);
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+ bool created_log;
+
+ conn = S2C(session);
+ log = conn->log;
+ created_log = true; /* Stays true when no file switch happens below. */
+ /*
+ * Add recsize to alloc_lsn. Save our starting LSN where the previous allocation finished for
+ * the release LSN. That way when log files switch, we're waiting for the correct LSN from
+ * outstanding writes.
+ *
+ * NOTE(review): nothing in this function advances alloc_lsn by recsize; recsize is only used
+ * for the fit check below. Presumably the addition happens elsewhere -- confirm.
+ */
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
+ /*
+ * We need to set the release LSN earlier, before a log file change.
+ */
+ slot->slot_release_lsn = log->alloc_lsn;
+ /*
+ * Make sure that the size can fit in the file. Proactively switch if it cannot. This reduces,
+ * but does not eliminate, log files that exceed the maximum file size. We want to minimize the
+ * risk of an error due to no space.
+ */
+ if (F_ISSET(log, WT_LOG_FORCE_NEWFILE) || !__log_size_fit(session, &log->alloc_lsn, recsize)) {
+ WT_RET(__log_newfile(session, false, &created_log));
+ F_CLR(log, WT_LOG_FORCE_NEWFILE); /* Cleared only after the new file was set up. */
+ if (log->log_close_fh != NULL)
+ F_SET(slot, WT_SLOT_CLOSEFH);
+ }
+
+ /*
+ * Pre-allocate on the first real write into the log file, if it was just created (i.e. not
+ * pre-allocated).
+ */
+ if (log->alloc_lsn.l.offset == log->first_record && created_log)
+ WT_RET(__log_prealloc(session, log->log_fh));
+ /*
+ * Initialize the slot for activation.
+ */
+ __wt_log_slot_activate(session, slot);
+
+ return (0);
}
/*
* __log_truncate_file --
- * Truncate a log file to the specified offset.
- *
- * If the underlying file system doesn't support truncate then we need to
- * zero out the rest of the file, doing an effective truncate.
+ * Truncate a log file to the specified offset. If the underlying file system doesn't support
+ * truncate then we need to zero out the rest of the file, doing an effective truncate.
+ *
+ * The truncate is attempted only if it has not already been marked unsupported and no hot
+ * backup is in progress. When it ran, any result other than ENOTSUP (including success) is
+ * returned directly; ENOTSUP sets WT_LOG_TRUNCATE_NOTSUP so later calls skip the attempt. In
+ * every other case the file is zeroed from offset, with conn->log_file_max passed as the final
+ * argument to __wt_file_zero.
*/
static int
__log_truncate_file(WT_SESSION_IMPL *session, WT_FH *log_fh, wt_off_t offset)
{
- WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
- WT_LOG *log;
- bool skipp;
-
- conn = S2C(session);
- log = conn->log;
-
- if (!F_ISSET(log, WT_LOG_TRUNCATE_NOTSUP) && !conn->hot_backup) {
- WT_WITH_HOTBACKUP_READ_LOCK(session,
- ret = __wt_ftruncate(
- session, log_fh, offset), &skipp);
- if (!skipp) {
- if (ret != ENOTSUP)
- return (ret);
- F_SET(log, WT_LOG_TRUNCATE_NOTSUP);
- }
- }
-
- return (__wt_file_zero(session, log_fh, offset, conn->log_file_max));
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+ bool skipp; /* Filled in by WT_WITH_HOTBACKUP_READ_LOCK; when set, ret is not consulted. */
+
+ conn = S2C(session);
+ log = conn->log;
+
+ if (!F_ISSET(log, WT_LOG_TRUNCATE_NOTSUP) && !conn->hot_backup) {
+ WT_WITH_HOTBACKUP_READ_LOCK(session, ret = __wt_ftruncate(session, log_fh, offset), &skipp);
+ if (!skipp) {
+ if (ret != ENOTSUP)
+ return (ret);
+ F_SET(log, WT_LOG_TRUNCATE_NOTSUP);
+ }
+ }
+
+ return (__wt_file_zero(session, log_fh, offset, conn->log_file_max));
}
/*
* __log_truncate --
- * Truncate the log to the given LSN. If this_log is set, it will only
- * truncate the log file indicated in the given LSN. If not set,
- * it will truncate between the given LSN and the trunc_lsn. That is,
- * since we pre-allocate log files, it will free that space and allow the
- * log to be traversed. We use the trunc_lsn because logging has already
- * opened the new/next log file before recovery ran. If salvage_mode is
- * set, we verify headers of log files visited and recreate them if they
- * are damaged. This function assumes we are in recovery or other
- * dedicated time and not during live running.
+ * Truncate the log to the given LSN. If this_log is set, it will only truncate the log file
+ * indicated in the given LSN. If not set, it will truncate between the given LSN and the
+ * trunc_lsn. That is, since we pre-allocate log files, it will free that space and allow the
+ * log to be traversed. We use the trunc_lsn because logging has already opened the new/next log
+ * file before recovery ran. If salvage_mode is set, we verify headers of log files visited and
+ * recreate them if they are damaged. This function assumes we are in recovery or other
+ * dedicated time and not during live running.
+ *
+ * Intervening files are those whose number is strictly greater than the LSN's file and strictly
+ * less than the trunc_lsn's file; each is truncated to log->first_record. In salvage mode the
+ * range of files visited is reported through __wt_msg on exit, including on error.
*/
static int
-__log_truncate(WT_SESSION_IMPL *session, WT_LSN *lsn, bool this_log,
- bool salvage_mode)
+__log_truncate(WT_SESSION_IMPL *session, WT_LSN *lsn, bool this_log, bool salvage_mode)
{
- WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
- WT_FH *log_fh;
- WT_LOG *log;
- uint32_t lognum, salvage_first, salvage_last;
- u_int i, logcount;
- char **logfiles;
- bool need_salvage, opened;
-
- conn = S2C(session);
- log = conn->log;
- log_fh = NULL;
- logcount = 0;
- logfiles = NULL;
- salvage_first = salvage_last = 0;
- need_salvage = false;
-
- /*
- * Truncate the log file to the given LSN.
- *
- * It's possible the underlying file system doesn't support truncate
- * (there are existing examples), which is fine, but we don't want to
- * repeatedly do the setup work just to find that out every time. Check
- * before doing work, and if there's a not-supported error, turn off
- * future truncates.
- */
- WT_ERR(__log_openfile(session, lsn->l.file, 0, &log_fh));
- WT_ERR(__log_truncate_file(session, log_fh, lsn->l.offset));
- WT_ERR(__wt_fsync(session, log_fh, true));
- WT_ERR(__wt_close(session, &log_fh));
-
- if (salvage_mode)
- WT_ERR(__wt_msg(session,
- "salvage: log file %" PRIu32 " truncated", lsn->l.file));
-
- /*
- * If we just want to truncate the current log, return and skip
- * looking for intervening logs.
- */
- if (this_log)
- goto err;
- WT_ERR(__log_get_files(session, WT_LOG_FILENAME, &logfiles, &logcount));
- for (i = 0; i < logcount; i++) {
- WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
- if (lognum > lsn->l.file && lognum < log->trunc_lsn.l.file) {
- opened = false;
- if (salvage_mode) {
- /*
- * When salvaging, we verify that the
- * header of the log file is valid.
- * If not, create a new, empty one.
- */
- need_salvage = false;
- WT_ERR(__log_open_verify(session, lognum,
- &log_fh, NULL, NULL, &need_salvage));
- if (need_salvage) {
- WT_ASSERT(session, log_fh == NULL);
- WT_ERR(__wt_log_remove(session,
- WT_LOG_FILENAME, lognum));
- WT_ERR(__wt_log_allocfile(session,
- lognum, WT_LOG_FILENAME));
- } else
- opened = true;
-
- if (salvage_first == 0)
- salvage_first = lognum;
- salvage_last = lognum;
- }
- if (!opened)
- WT_ERR(__log_openfile(session, lognum, 0,
- &log_fh));
- /*
- * If there are intervening files pre-allocated,
- * truncate them to the end of the log file header.
- */
- WT_ERR(__log_truncate_file(
- session, log_fh, log->first_record));
- WT_ERR(__wt_fsync(session, log_fh, true));
- WT_ERR(__wt_close(session, &log_fh));
- }
- }
-err: WT_TRET(__wt_close(session, &log_fh));
- WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount));
- if (salvage_first != 0) {
- if (salvage_last > salvage_first)
- WT_TRET(__wt_msg(session,
- "salvage: log files %" PRIu32 "-%" PRIu32
- " truncated at beginning", salvage_first,
- salvage_last));
- else
- WT_TRET(__wt_msg(session,
- "salvage: log file %" PRIu32
- " truncated at beginning", salvage_first));
- }
- return (ret);
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_FH *log_fh;
+ WT_LOG *log;
+ uint32_t lognum, salvage_first, salvage_last;
+ u_int i, logcount;
+ char **logfiles;
+ bool need_salvage, opened;
+
+ conn = S2C(session);
+ log = conn->log;
+ log_fh = NULL;
+ logcount = 0;
+ logfiles = NULL;
+ salvage_first = salvage_last = 0; /* Zero means no file was visited in salvage mode. */
+ need_salvage = false;
+
+ /*
+ * Truncate the log file to the given LSN.
+ *
+ * It's possible the underlying file system doesn't support truncate
+ * (there are existing examples), which is fine, but we don't want to
+ * repeatedly do the setup work just to find that out every time. Check
+ * before doing work, and if there's a not-supported error, turn off
+ * future truncates.
+ */
+ WT_ERR(__log_openfile(session, lsn->l.file, 0, &log_fh));
+ WT_ERR(__log_truncate_file(session, log_fh, lsn->l.offset));
+ WT_ERR(__wt_fsync(session, log_fh, true));
+ WT_ERR(__wt_close(session, &log_fh));
+
+ if (salvage_mode)
+ WT_ERR(__wt_msg(session, "salvage: log file %" PRIu32 " truncated", lsn->l.file));
+
+ /*
+ * If we just want to truncate the current log, return and skip looking for intervening logs.
+ * The err label is also the normal exit path: it performs the shared cleanup and returns ret.
+ */
+ if (this_log)
+ goto err;
+ WT_ERR(__log_get_files(session, WT_LOG_FILENAME, &logfiles, &logcount));
+ for (i = 0; i < logcount; i++) {
+ WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
+ if (lognum > lsn->l.file && lognum < log->trunc_lsn.l.file) {
+ opened = false;
+ if (salvage_mode) {
+ /*
+ * When salvaging, we verify that the header of the log file is valid. If not,
+ * create a new, empty one. A file whose header verified is left open in log_fh
+ * and reused below; a recreated file is opened afresh.
+ */
+ need_salvage = false;
+ WT_ERR(__log_open_verify(session, lognum, &log_fh, NULL, NULL, &need_salvage));
+ if (need_salvage) {
+ WT_ASSERT(session, log_fh == NULL);
+ WT_ERR(__wt_log_remove(session, WT_LOG_FILENAME, lognum));
+ WT_ERR(__wt_log_allocfile(session, lognum, WT_LOG_FILENAME));
+ } else
+ opened = true;
+
+ if (salvage_first == 0)
+ salvage_first = lognum;
+ salvage_last = lognum;
+ }
+ if (!opened)
+ WT_ERR(__log_openfile(session, lognum, 0, &log_fh));
+ /*
+ * If there are intervening files pre-allocated, truncate them to the end of the log
+ * file header.
+ */
+ WT_ERR(__log_truncate_file(session, log_fh, log->first_record));
+ WT_ERR(__wt_fsync(session, log_fh, true));
+ WT_ERR(__wt_close(session, &log_fh));
+ }
+ }
+err:
+ WT_TRET(__wt_close(session, &log_fh));
+ WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount));
+ if (salvage_first != 0) {
+ if (salvage_last > salvage_first)
+ WT_TRET(
+ __wt_msg(session, "salvage: log files %" PRIu32 "-%" PRIu32 " truncated at beginning",
+ salvage_first, salvage_last));
+ else
+ WT_TRET(__wt_msg(
+ session, "salvage: log file %" PRIu32 " truncated at beginning", salvage_first));
+ }
+ return (ret);
}
/*
* __wt_log_allocfile --
- * Given a log number, create a new log file by writing the header,
- * pre-allocating the file and moving it to the destination name.
+ * Given a log number, create a new log file by writing the header, pre-allocating the file and
+ * moving it to the destination name.
+ *
+ * lognum and dest together name the final file; the work is done on a temporary file whose
+ * number comes from an atomic increment of log->tmp_fileid. The temporary file is set up and
+ * renamed while holding log->log_fs_lock. Returns 0 or the first error encountered, combined
+ * with the result of closing the file handle.
*/
int
-__wt_log_allocfile(
- WT_SESSION_IMPL *session, uint32_t lognum, const char *dest)
+__wt_log_allocfile(WT_SESSION_IMPL *session, uint32_t lognum, const char *dest)
{
- WT_CONNECTION_IMPL *conn;
- WT_DECL_ITEM(from_path);
- WT_DECL_ITEM(to_path);
- WT_DECL_RET;
- WT_FH *log_fh;
- WT_LOG *log;
- uint32_t tmp_id;
-
- conn = S2C(session);
- log = conn->log;
- log_fh = NULL;
-
- /*
- * Preparing a log file entails creating a temporary file:
- * - Writing the header.
- * - Truncating to the offset of the first record.
- * - Pre-allocating the file if needed.
- * - Renaming it to the desired file name.
- */
- WT_RET(__wt_scr_alloc(session, 0, &from_path));
- WT_ERR(__wt_scr_alloc(session, 0, &to_path));
- tmp_id = __wt_atomic_add32(&log->tmp_fileid, 1);
- WT_ERR(__log_filename(session, tmp_id, WT_LOG_TMPNAME, from_path));
- WT_ERR(__log_filename(session, lognum, dest, to_path));
- __wt_spin_lock(session, &log->log_fs_lock);
- /*
- * Set up the temporary file.
- */
- WT_ERR(__log_openfile(session, tmp_id, WT_LOG_OPEN_CREATE_OK, &log_fh));
- WT_ERR(__log_file_header(session, log_fh, NULL, true));
- WT_ERR(__log_prealloc(session, log_fh));
- WT_ERR(__wt_fsync(session, log_fh, true));
- WT_ERR(__wt_close(session, &log_fh));
- __wt_verbose(session, WT_VERB_LOG,
- "log_allocfile: rename %s to %s",
- (const char *)from_path->data, (const char *)to_path->data);
- /*
- * Rename it into place and make it available.
- */
- WT_ERR(__wt_fs_rename(session, from_path->data, to_path->data, false));
-
-err: __wt_scr_free(session, &from_path);
- __wt_scr_free(session, &to_path);
- __wt_spin_unlock(session, &log->log_fs_lock);
- WT_TRET(__wt_close(session, &log_fh));
- return (ret);
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_ITEM(from_path);
+ WT_DECL_ITEM(to_path);
+ WT_DECL_RET;
+ WT_FH *log_fh;
+ WT_LOG *log;
+ uint32_t tmp_id;
+ bool locked;
+
+ conn = S2C(session);
+ log = conn->log;
+ log_fh = NULL;
+ locked = false;
+
+ /*
+ * Preparing a log file entails creating a temporary file:
+ * - Writing the header.
+ * - Truncating to the offset of the first record.
+ * - Pre-allocating the file if needed.
+ * - Renaming it to the desired file name.
+ */
+ WT_RET(__wt_scr_alloc(session, 0, &from_path));
+ WT_ERR(__wt_scr_alloc(session, 0, &to_path));
+ tmp_id = __wt_atomic_add32(&log->tmp_fileid, 1);
+ WT_ERR(__log_filename(session, tmp_id, WT_LOG_TMPNAME, from_path));
+ WT_ERR(__log_filename(session, lognum, dest, to_path));
+ __wt_spin_lock(session, &log->log_fs_lock);
+ /*
+ * Remember that the lock is held: the three error exits above jump to the err label before the
+ * lock is taken, and the cleanup code must not unlock a lock this thread does not hold.
+ */
+ locked = true;
+ /*
+ * Set up the temporary file.
+ */
+ WT_ERR(__log_openfile(session, tmp_id, WT_LOG_OPEN_CREATE_OK, &log_fh));
+ WT_ERR(__log_file_header(session, log_fh, NULL, true));
+ WT_ERR(__log_prealloc(session, log_fh));
+ WT_ERR(__wt_fsync(session, log_fh, true));
+ WT_ERR(__wt_close(session, &log_fh));
+ __wt_verbose(session, WT_VERB_LOG, "log_allocfile: rename %s to %s",
+ (const char *)from_path->data, (const char *)to_path->data);
+ /*
+ * Rename it into place and make it available.
+ */
+ WT_ERR(__wt_fs_rename(session, from_path->data, to_path->data, false));
+
+err:
+ __wt_scr_free(session, &from_path);
+ __wt_scr_free(session, &to_path);
+ if (locked)
+ __wt_spin_unlock(session, &log->log_fs_lock);
+ WT_TRET(__wt_close(session, &log_fh));
+ return (ret);
}
/*
* __wt_log_remove --
- * Given a log number, remove that log file.
+ * Given a log number, remove that log file.
+ *
+ * The path is built from file_prefix and lognum by __log_filename(); returns 0 or the first
+ * error from building the path or removing the file.
*/
int
-__wt_log_remove(WT_SESSION_IMPL *session,
- const char *file_prefix, uint32_t lognum)
+__wt_log_remove(WT_SESSION_IMPL *session, const char *file_prefix, uint32_t lognum)
{
- WT_DECL_ITEM(path);
- WT_DECL_RET;
-
- WT_RET(__wt_scr_alloc(session, 0, &path));
- WT_ERR(__log_filename(session, lognum, file_prefix, path));
- __wt_verbose(session, WT_VERB_LOG,
- "log_remove: remove log %s", (const char *)path->data);
- WT_ERR(__wt_fs_remove(session, path->data, false));
-err: __wt_scr_free(session, &path);
- return (ret);
+ WT_DECL_ITEM(path);
+ WT_DECL_RET;
+
+ WT_RET(__wt_scr_alloc(session, 0, &path));
+ WT_ERR(__log_filename(session, lognum, file_prefix, path));
+ __wt_verbose(session, WT_VERB_LOG, "log_remove: remove log %s", (const char *)path->data);
+ WT_ERR(__wt_fs_remove(session, path->data, false));
+err:
+ __wt_scr_free(session, &path); /* The scratch buffer is released on success and on error. */
+ return (ret);
}
/*
* __wt_log_open --
- * Open the appropriate log file for the connection. The purpose is
- * to find the last log file that exists, open it and set our initial
- * LSNs to the end of that file. If none exist, call __log_newfile
- * to create it.
+ * Open the appropriate log file for the connection. The purpose is to find the last log file
+ * that exists, open it and set our initial LSNs to the end of that file. If none exist, call
+ * __log_newfile to create it.
*/
int
__wt_log_open(WT_SESSION_IMPL *session)
{
- WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
- WT_LOG *log;
- uint32_t firstlog, lastlog, lognum;
- uint16_t version;
- u_int i, logcount;
- char **logfiles;
- bool need_salvage;
-
- conn = S2C(session);
- log = conn->log;
- logfiles = NULL;
- logcount = 0;
-
- /*
- * Open up a file handle to the log directory if we haven't.
- */
- if (log->log_dir_fh == NULL) {
- __wt_verbose(session, WT_VERB_LOG,
- "log_open: open fh to directory %s", conn->log_path);
- WT_RET(__wt_open(session, conn->log_path,
- WT_FS_OPEN_FILE_TYPE_DIRECTORY, 0, &log->log_dir_fh));
- }
-
- if (!F_ISSET(conn, WT_CONN_READONLY))
- WT_ERR(__log_prealloc_remove(session));
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+ uint32_t firstlog, lastlog, lognum;
+ uint16_t version;
+ u_int i, logcount;
+ char **logfiles;
+ bool need_salvage;
+
+ conn = S2C(session);
+ log = conn->log;
+ logfiles = NULL;
+ logcount = 0;
+
+ /*
+ * Open up a file handle to the log directory if we haven't.
+ */
+ if (log->log_dir_fh == NULL) {
+ __wt_verbose(session, WT_VERB_LOG, "log_open: open fh to directory %s", conn->log_path);
+ WT_RET(
+ __wt_open(session, conn->log_path, WT_FS_OPEN_FILE_TYPE_DIRECTORY, 0, &log->log_dir_fh));
+ }
+
+ if (!F_ISSET(conn, WT_CONN_READONLY))
+ WT_ERR(__log_prealloc_remove(session));
again:
- /*
- * Now look at the log files and set our LSNs.
- */
- lastlog = 0;
- firstlog = UINT32_MAX;
- need_salvage = false;
-
- WT_ERR(__log_get_files(session, WT_LOG_FILENAME, &logfiles, &logcount));
- for (i = 0; i < logcount; i++) {
- WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
- lastlog = WT_MAX(lastlog, lognum);
- firstlog = WT_MIN(firstlog, lognum);
- }
- log->fileid = lastlog;
- __wt_verbose(session, WT_VERB_LOG,
- "log_open: first log %" PRIu32 " last log %" PRIu32,
- firstlog, lastlog);
- if (firstlog == UINT32_MAX) {
- WT_ASSERT(session, logcount == 0);
- WT_INIT_LSN(&log->first_lsn);
- } else {
- WT_SET_LSN(&log->first_lsn, firstlog, 0);
- /*
- * If we have existing log files, check the last log now before
- * we create a new log file so that we can detect an unsupported
- * version before modifying the file space.
- */
- WT_ERR(__log_open_verify(session, lastlog, NULL, NULL,
- &version, &need_salvage));
-
- /*
- * If we were asked to salvage and the last log file was
- * indeed corrupt, remove it and try all over again.
- */
- if (need_salvage) {
- WT_ERR(__wt_log_remove(
- session, WT_LOG_FILENAME, lastlog));
- WT_ERR(__wt_msg(session,
- "salvage: log file %" PRIu32 " removed", lastlog));
- WT_ERR(__wt_fs_directory_list_free(session, &logfiles,
- logcount));
- logfiles = NULL;
- goto again;
- }
- }
-
- /*
- * Start logging at the beginning of the next log file, no matter
- * where the previous log file ends.
- */
- if (!F_ISSET(conn, WT_CONN_READONLY)) {
- WT_WITH_SLOT_LOCK(session, log,
- ret = __log_newfile(session, true, NULL));
- WT_ERR(ret);
- }
-
- /* If we found log files, save the new state. */
- if (logcount > 0) {
- /*
- * If we're running in a downgraded mode and there are earlier
- * logs detect if they're at a higher version. If so, we need
- * to force recovery (to write a full checkpoint) and force
- * archiving to remove all higher version logs.
- */
- if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_DOWNGRADED)) {
- for (i = 0; i < logcount; ++i) {
- WT_ERR(__wt_log_extract_lognum(
- session, logfiles[i], &lognum));
- /*
- * By sending in a NULL file handle, we don't
- * have to close the file.
- */
- WT_ERR(__log_open_verify(session,
- lognum, NULL, NULL, &version, NULL));
- /*
- * If we find any log file at the wrong version
- * set the flag and we're done.
- */
- if (log->log_version != version) {
- FLD_SET(conn->log_flags,
- WT_CONN_LOG_FORCE_DOWNGRADE);
- break;
- }
- }
- }
- log->trunc_lsn = log->alloc_lsn;
- FLD_SET(conn->log_flags, WT_CONN_LOG_EXISTED);
- }
-
-err: WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount));
- if (ret == 0)
- F_SET(log, WT_LOG_OPENED);
- return (ret);
+ /*
+ * Now look at the log files and set our LSNs.
+ */
+ lastlog = 0;
+ firstlog = UINT32_MAX;
+ need_salvage = false;
+
+ WT_ERR(__log_get_files(session, WT_LOG_FILENAME, &logfiles, &logcount));
+ for (i = 0; i < logcount; i++) {
+ WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
+ lastlog = WT_MAX(lastlog, lognum);
+ firstlog = WT_MIN(firstlog, lognum);
+ }
+ log->fileid = lastlog;
+ __wt_verbose(
+ session, WT_VERB_LOG, "log_open: first log %" PRIu32 " last log %" PRIu32, firstlog, lastlog);
+ if (firstlog == UINT32_MAX) {
+ WT_ASSERT(session, logcount == 0);
+ WT_INIT_LSN(&log->first_lsn);
+ } else {
+ WT_SET_LSN(&log->first_lsn, firstlog, 0);
+ /*
+ * If we have existing log files, check the last log now before we create a new log file so
+ * that we can detect an unsupported version before modifying the file space.
+ */
+ WT_ERR(__log_open_verify(session, lastlog, NULL, NULL, &version, &need_salvage));
+
+ /*
+ * If we were asked to salvage and the last log file was indeed corrupt, remove it and try
+ * all over again.
+ */
+ if (need_salvage) {
+ WT_ERR(__wt_log_remove(session, WT_LOG_FILENAME, lastlog));
+ WT_ERR(__wt_msg(session, "salvage: log file %" PRIu32 " removed", lastlog));
+ WT_ERR(__wt_fs_directory_list_free(session, &logfiles, logcount));
+ logfiles = NULL;
+ goto again;
+ }
+ }
+
+ /*
+ * Start logging at the beginning of the next log file, no matter where the previous log file
+ * ends.
+ */
+ if (!F_ISSET(conn, WT_CONN_READONLY)) {
+ WT_WITH_SLOT_LOCK(session, log, ret = __log_newfile(session, true, NULL));
+ WT_ERR(ret);
+ }
+
+ /* If we found log files, save the new state. */
+ if (logcount > 0) {
+ /*
+ * If we're running in a downgraded mode and there are earlier logs, detect if they're at a
+ * higher version. If so, we need to force recovery (to write a full checkpoint) and force
+ * archiving to remove all higher version logs.
+ */
+ if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_DOWNGRADED)) {
+ for (i = 0; i < logcount; ++i) {
+ WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
+ /*
+ * By sending in a NULL file handle, we don't have to close the file.
+ */
+ WT_ERR(__log_open_verify(session, lognum, NULL, NULL, &version, NULL));
+ /*
+ * If we find any log file at the wrong version, set the flag and we're done.
+ */
+ if (log->log_version != version) {
+ FLD_SET(conn->log_flags, WT_CONN_LOG_FORCE_DOWNGRADE);
+ break;
+ }
+ }
+ }
+ log->trunc_lsn = log->alloc_lsn;
+ FLD_SET(conn->log_flags, WT_CONN_LOG_EXISTED);
+ }
+
+err:
+ WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount));
+ if (ret == 0)
+ F_SET(log, WT_LOG_OPENED);
+ return (ret);
}
/*
* __wt_log_close --
- * Close the log file.
+ * Close the log file.
*/
int
__wt_log_close(WT_SESSION_IMPL *session)
{
- WT_CONNECTION_IMPL *conn;
- WT_LOG *log;
-
- conn = S2C(session);
- log = conn->log;
-
- if (log->log_close_fh != NULL && log->log_close_fh != log->log_fh) {
- __wt_verbose(session, WT_VERB_LOG,
- "closing old log %s", log->log_close_fh->name);
- if (!F_ISSET(conn, WT_CONN_READONLY))
- WT_RET(__wt_fsync(session, log->log_close_fh, true));
- WT_RET(__wt_close(session, &log->log_close_fh));
- }
- if (log->log_fh != NULL) {
- __wt_verbose(session, WT_VERB_LOG,
- "closing log %s", log->log_fh->name);
- if (!F_ISSET(conn, WT_CONN_READONLY))
- WT_RET(__wt_fsync(session, log->log_fh, true));
- WT_RET(__wt_close(session, &log->log_fh));
- log->log_fh = NULL;
- }
- if (log->log_dir_fh != NULL) {
- __wt_verbose(session, WT_VERB_LOG,
- "closing log directory %s", log->log_dir_fh->name);
- if (!F_ISSET(conn, WT_CONN_READONLY))
- WT_RET(__wt_fsync(session, log->log_dir_fh, true));
- WT_RET(__wt_close(session, &log->log_dir_fh));
- log->log_dir_fh = NULL;
- }
- F_CLR(log, WT_LOG_OPENED);
- return (0);
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+
+ conn = S2C(session);
+ log = conn->log;
+
+ if (log->log_close_fh != NULL && log->log_close_fh != log->log_fh) {
+ __wt_verbose(session, WT_VERB_LOG, "closing old log %s", log->log_close_fh->name);
+ if (!F_ISSET(conn, WT_CONN_READONLY))
+ WT_RET(__wt_fsync(session, log->log_close_fh, true));
+ WT_RET(__wt_close(session, &log->log_close_fh));
+ }
+ if (log->log_fh != NULL) {
+ __wt_verbose(session, WT_VERB_LOG, "closing log %s", log->log_fh->name);
+ if (!F_ISSET(conn, WT_CONN_READONLY))
+ WT_RET(__wt_fsync(session, log->log_fh, true));
+ WT_RET(__wt_close(session, &log->log_fh));
+ log->log_fh = NULL;
+ }
+ if (log->log_dir_fh != NULL) {
+ __wt_verbose(session, WT_VERB_LOG, "closing log directory %s", log->log_dir_fh->name);
+ if (!F_ISSET(conn, WT_CONN_READONLY))
+ WT_RET(__wt_fsync(session, log->log_dir_fh, true));
+ WT_RET(__wt_close(session, &log->log_dir_fh));
+ log->log_dir_fh = NULL;
+ }
+ F_CLR(log, WT_LOG_OPENED);
+ return (0);
}
/*
* __log_has_hole --
- * Determine if the current offset represents a hole in the log
- * file (i.e. there is valid data somewhere after the hole), or
- * if this is the end of this log file and the remainder of the
- * file is zeroes.
+ * Determine if the current offset represents a hole in the log file (i.e. there is valid data
+ * somewhere after the hole), or if this is the end of this log file and the remainder of the
+ * file is zeroes.
*/
static int
-__log_has_hole(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t log_size,
- wt_off_t offset, wt_off_t *error_offset, bool *hole)
+__log_has_hole(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t log_size, wt_off_t offset,
+ wt_off_t *error_offset, bool *hole)
{
- WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
- WT_LOG *log;
- WT_LOG_RECORD *logrec;
- wt_off_t off, remainder;
- size_t allocsize, buf_left, bufsz, rdlen;
- char *buf, *p, *zerobuf;
- bool corrupt;
-
- *error_offset = 0;
- corrupt = *hole = false;
-
- conn = S2C(session);
- log = conn->log;
- remainder = log_size - offset;
-
- /*
- * It can be very slow looking for the last real record in the log
- * in very small chunks. Walk a megabyte at a time. If we find a
- * part of the log that is not just zeroes we know this log file
- * has a hole in it.
- */
- buf = zerobuf = NULL;
- if (log == NULL || log->allocsize < WT_MEGABYTE)
- bufsz = WT_MEGABYTE;
- else
- bufsz = log->allocsize;
-
- if ((size_t)remainder < bufsz)
- bufsz = (size_t)remainder;
- WT_RET(__wt_calloc_def(session, bufsz, &buf));
- WT_ERR(__wt_calloc_def(session, bufsz, &zerobuf));
-
- /*
- * Read in a chunk starting at the given offset.
- * Compare against a known zero byte chunk.
- */
- for (off = offset; remainder > 0;
- remainder -= (wt_off_t)rdlen, off += (wt_off_t)rdlen) {
- rdlen = WT_MIN(bufsz, (size_t)remainder);
- WT_ERR(__log_fs_read(session, fh, off, rdlen, buf));
- allocsize = (log == NULL ? WT_LOG_ALIGN : log->allocsize);
- if (memcmp(buf, zerobuf, rdlen) != 0) {
- /*
- * Find where the next log record starts after the
- * hole.
- */
- for (p = buf, buf_left = rdlen; buf_left > 0;
- buf_left -= rdlen, p += rdlen) {
- rdlen = WT_MIN(allocsize, buf_left);
- if (memcmp(p, zerobuf, rdlen) != 0)
- break;
- }
- /*
- * A presumed log record begins here where the buffer
- * becomes non-zero. If we have enough of a log record
- * present in the buffer, we either have a valid header
- * or corruption. Verify the header of this record to
- * determine whether it is just a hole or corruption.
- *
- * We don't bother making this check for backup copies,
- * as records may have their beginning zeroed, hence
- * the part after a hole may in fact be the middle of
- * the record.
- */
- if (!F_ISSET(conn, WT_CONN_WAS_BACKUP)) {
- logrec = (WT_LOG_RECORD *)p;
- if (buf_left >= sizeof(WT_LOG_RECORD)) {
- off += p - buf;
- WT_ERR(__log_record_verify(session, fh,
- (uint32_t)off, logrec, &corrupt));
- if (corrupt)
- *error_offset = off;
- }
- }
- *hole = true;
- break;
- }
- }
-
-err: __wt_free(session, buf);
- __wt_free(session, zerobuf);
- return (ret);
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+ WT_LOG_RECORD *logrec;
+ wt_off_t off, remainder;
+ size_t allocsize, buf_left, bufsz, rdlen;
+ char *buf, *p, *zerobuf;
+ bool corrupt;
+
+ *error_offset = 0;
+ corrupt = *hole = false;
+
+ conn = S2C(session);
+ log = conn->log;
+ remainder = log_size - offset;
+
+ /*
+ * It can be very slow looking for the last real record in the log in very small chunks. Walk a
+ * megabyte at a time. If we find a part of the log that is not just zeroes we know this log
+ * file has a hole in it.
+ */
+ buf = zerobuf = NULL;
+ if (log == NULL || log->allocsize < WT_MEGABYTE)
+ bufsz = WT_MEGABYTE;
+ else
+ bufsz = log->allocsize;
+
+ if ((size_t)remainder < bufsz)
+ bufsz = (size_t)remainder;
+ WT_RET(__wt_calloc_def(session, bufsz, &buf));
+ WT_ERR(__wt_calloc_def(session, bufsz, &zerobuf));
+
+ /*
+ * Read in a chunk starting at the given offset. Compare against a known zero byte chunk.
+ */
+ for (off = offset; remainder > 0; remainder -= (wt_off_t)rdlen, off += (wt_off_t)rdlen) {
+ rdlen = WT_MIN(bufsz, (size_t)remainder);
+ WT_ERR(__log_fs_read(session, fh, off, rdlen, buf));
+ allocsize = (log == NULL ? WT_LOG_ALIGN : log->allocsize);
+ if (memcmp(buf, zerobuf, rdlen) != 0) {
+ /*
+ * Find where the next log record starts after the hole.
+ */
+ for (p = buf, buf_left = rdlen; buf_left > 0; buf_left -= rdlen, p += rdlen) {
+ rdlen = WT_MIN(allocsize, buf_left);
+ if (memcmp(p, zerobuf, rdlen) != 0)
+ break;
+ }
+ /*
+ * A presumed log record begins here where the buffer
+ * becomes non-zero. If we have enough of a log record
+ * present in the buffer, we either have a valid header
+ * or corruption. Verify the header of this record to
+ * determine whether it is just a hole or corruption.
+ *
+ * We don't bother making this check for backup copies,
+ * as records may have their beginning zeroed, hence
+ * the part after a hole may in fact be the middle of
+ * the record.
+ */
+ if (!F_ISSET(conn, WT_CONN_WAS_BACKUP)) {
+ logrec = (WT_LOG_RECORD *)p;
+ if (buf_left >= sizeof(WT_LOG_RECORD)) {
+ off += p - buf;
+ WT_ERR(__log_record_verify(session, fh, (uint32_t)off, logrec, &corrupt));
+ if (corrupt)
+ *error_offset = off;
+ }
+ }
+ *hole = true;
+ break;
+ }
+ }
+
+err:
+ __wt_free(session, buf);
+ __wt_free(session, zerobuf);
+ return (ret);
}
/*
* __log_check_partial_write --
- * Determine if the log record may be a partial write. If that's
- * possible, return true, otherwise false.
- *
- * Since the log file is initially zeroed up to a predetermined size,
- * any record that falls within that boundary that ends in one or
- * more zeroes may be partial (or the initial record may have been
- * padded with zeroes before writing). The only way we have any certainty
- * is if the last byte is non-zero, when that happens, we know that
- * the write cannot be partial.
+ * Determine if the log record may be a partial write. If that's possible, return true,
+ * otherwise false. Since the log file is initially zeroed up to a predetermined size, any
+ * record that falls within that boundary that ends in one or more zeroes may be partial (or the
+ * initial record may have been padded with zeroes before writing). The only way we have any
+ * certainty is if the last byte is non-zero; when that happens, we know that the write cannot
+ * be partial.
*/
static bool
-__log_check_partial_write(WT_SESSION_IMPL *session, WT_ITEM *buf,
- uint32_t reclen)
+__log_check_partial_write(WT_SESSION_IMPL *session, WT_ITEM *buf, uint32_t reclen)
{
- uint8_t *rec;
-
- WT_UNUSED(session);
-
- /*
- * We only check the final byte since that's the only way have any
- * certainty. Even if the second to last byte is non-zero and the
- * last byte is zero, that could still technically be the result of
- * a partial write, however unlikely it may be.
- */
- rec = buf->mem;
- return (reclen > 0 && rec[reclen - 1] == 0);
+ uint8_t *rec;
+
+ WT_UNUSED(session);
+
+ /*
+ * We only check the final byte since that's the only way we have any certainty. Even if the second
+ * to last byte is non-zero and the last byte is zero, that could still technically be the
+ * result of a partial write, however unlikely it may be.
+ */
+ rec = buf->mem;
+ return (reclen > 0 && rec[reclen - 1] == 0);
}
/*
* __wt_log_release --
- * Release a log slot.
+ * Release a log slot.
*/
int
__wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep)
{
- WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
- WT_LOG *log;
- WT_LSN sync_lsn;
- uint64_t fsync_duration_usecs, time_start, time_stop;
- int64_t release_buffered, release_bytes;
- bool locked;
-
- conn = S2C(session);
- log = conn->log;
- locked = false;
- if (freep != NULL)
- *freep = 1;
- release_buffered = WT_LOG_SLOT_RELEASED_BUFFERED(slot->slot_state);
- release_bytes = release_buffered + slot->slot_unbuffered;
-
- /*
- * Checkpoints can be configured based on amount of log written.
- * Add in this log record to the sum and if needed, signal the
- * checkpoint condition. The logging subsystem manages the
- * accumulated field. There is a bit of layering violation
- * here checking the connection ckpt field and using its
- * condition.
- */
- if (WT_CKPT_LOGSIZE(conn)) {
- log->log_written += (wt_off_t)release_bytes;
- __wt_checkpoint_signal(session, log->log_written);
- }
-
- /* Write the buffered records */
- if (release_buffered != 0)
- WT_ERR(__log_fs_write(session, slot, slot->slot_start_offset,
- (size_t)release_buffered, slot->slot_buf.mem));
-
- /*
- * If we have to wait for a synchronous operation, we do not pass
- * handling of this slot off to the worker thread. The caller is
- * responsible for freeing the slot in that case. Otherwise the
- * worker thread will free it.
- */
- if (!F_ISSET(slot, WT_SLOT_FLUSH | WT_SLOT_SYNC_FLAGS)) {
- if (freep != NULL)
- *freep = 0;
- slot->slot_state = WT_LOG_SLOT_WRITTEN;
- /*
- * After this point the worker thread owns the slot. There
- * is nothing more to do but return.
- */
- /*
- * !!! Signalling the wrlsn_cond condition here results in
- * worse performance because it causes more scheduling churn
- * and more walking of the slot pool for a very small number
- * of slots to process. Don't signal here.
- */
- return (0);
- }
-
- /*
- * Wait for earlier groups to finish, otherwise there could
- * be holes in the log file.
- */
- WT_STAT_CONN_INCR(session, log_release_write_lsn);
- __log_wait_for_earlier_slot(session, slot);
-
- log->write_start_lsn = slot->slot_start_lsn;
- log->write_lsn = slot->slot_end_lsn;
-
- WT_ASSERT(session, slot != log->active_slot);
- __wt_cond_signal(session, log->log_write_cond);
- F_CLR(slot, WT_SLOT_FLUSH);
-
- /*
- * Signal the close thread if needed.
- */
- if (F_ISSET(slot, WT_SLOT_CLOSEFH))
- __wt_cond_signal(session, conn->log_file_cond);
-
- if (F_ISSET(slot, WT_SLOT_SYNC_DIRTY) && !F_ISSET(slot, WT_SLOT_SYNC) &&
- (ret = __wt_fsync(session, log->log_fh, false)) != 0) {
- /*
- * Ignore ENOTSUP, but don't try again.
- */
- if (ret != ENOTSUP)
- WT_ERR(ret);
- conn->log_dirty_max = 0;
- }
-
- /*
- * Try to consolidate calls to fsync to wait less. Acquire a spin lock
- * so that threads finishing writing to the log will wait while the
- * current fsync completes and advance log->sync_lsn.
- */
- while (F_ISSET(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) {
- /*
- * We have to wait until earlier log files have finished their
- * sync operations. The most recent one will set the LSN to the
- * beginning of our file.
- */
- if (log->sync_lsn.l.file < slot->slot_end_lsn.l.file ||
- __wt_spin_trylock(session, &log->log_sync_lock) != 0) {
- __wt_cond_wait(
- session, log->log_sync_cond, 10000, NULL);
- continue;
- }
- locked = true;
-
- /*
- * Record the current end of our update after the lock.
- * That is how far our calls can guarantee.
- */
- sync_lsn = slot->slot_end_lsn;
- /*
- * Check if we have to sync the parent directory. Some
- * combinations of sync flags may result in the log file
- * not yet stable in its parent directory. Do that
- * now if needed.
- */
- if (F_ISSET(slot, WT_SLOT_SYNC_DIR) &&
- (log->sync_dir_lsn.l.file < sync_lsn.l.file)) {
- WT_ASSERT(session, log->log_dir_fh != NULL);
- __wt_verbose(session, WT_VERB_LOG,
- "log_release: sync directory %s to LSN %" PRIu32
- "/%" PRIu32,
- log->log_dir_fh->name,
- sync_lsn.l.file, sync_lsn.l.offset);
- time_start = __wt_clock(session);
- WT_ERR(__wt_fsync(session, log->log_dir_fh, true));
- time_stop = __wt_clock(session);
- fsync_duration_usecs =
- WT_CLOCKDIFF_US(time_stop, time_start);
- log->sync_dir_lsn = sync_lsn;
- WT_STAT_CONN_INCR(session, log_sync_dir);
- WT_STAT_CONN_INCRV(session,
- log_sync_dir_duration, fsync_duration_usecs);
- }
-
- /*
- * Sync the log file if needed.
- */
- if (F_ISSET(slot, WT_SLOT_SYNC) &&
- __wt_log_cmp(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
- __wt_verbose(session, WT_VERB_LOG,
- "log_release: sync log %s to LSN %" PRIu32
- "/%" PRIu32,
- log->log_fh->name,
- sync_lsn.l.file, sync_lsn.l.offset);
- WT_STAT_CONN_INCR(session, log_sync);
- time_start = __wt_clock(session);
- WT_ERR(__wt_fsync(session, log->log_fh, true));
- time_stop = __wt_clock(session);
- fsync_duration_usecs =
- WT_CLOCKDIFF_US(time_stop, time_start);
- WT_STAT_CONN_INCRV(session,
- log_sync_duration, fsync_duration_usecs);
- log->sync_lsn = sync_lsn;
- __wt_cond_signal(session, log->log_sync_cond);
- }
- /*
- * Clear the flags before leaving the loop.
- */
- F_CLR(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR);
- locked = false;
- __wt_spin_unlock(session, &log->log_sync_lock);
- }
-err: if (locked)
- __wt_spin_unlock(session, &log->log_sync_lock);
- if (ret != 0 && slot->slot_error == 0)
- slot->slot_error = ret;
- return (ret);
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+ WT_LSN sync_lsn;
+ uint64_t fsync_duration_usecs, time_start, time_stop;
+ int64_t release_buffered, release_bytes;
+ bool locked;
+
+ conn = S2C(session);
+ log = conn->log;
+ locked = false;
+ if (freep != NULL)
+ *freep = 1;
+ release_buffered = WT_LOG_SLOT_RELEASED_BUFFERED(slot->slot_state);
+ release_bytes = release_buffered + slot->slot_unbuffered;
+
+ /*
+ * Checkpoints can be configured based on amount of log written. Add in this log record to the
+ * sum and if needed, signal the checkpoint condition. The logging subsystem manages the
+ * accumulated field. There is a bit of layering violation here checking the connection ckpt
+ * field and using its condition.
+ */
+ if (WT_CKPT_LOGSIZE(conn)) {
+ log->log_written += (wt_off_t)release_bytes;
+ __wt_checkpoint_signal(session, log->log_written);
+ }
+
+ /* Write the buffered records */
+ if (release_buffered != 0)
+ WT_ERR(__log_fs_write(
+ session, slot, slot->slot_start_offset, (size_t)release_buffered, slot->slot_buf.mem));
+
+ /*
+ * If we have to wait for a synchronous operation, we do not pass handling of this slot off to
+ * the worker thread. The caller is responsible for freeing the slot in that case. Otherwise the
+ * worker thread will free it.
+ */
+ if (!F_ISSET(slot, WT_SLOT_FLUSH | WT_SLOT_SYNC_FLAGS)) {
+ if (freep != NULL)
+ *freep = 0;
+ slot->slot_state = WT_LOG_SLOT_WRITTEN;
+ /*
+ * After this point the worker thread owns the slot. There is nothing more to do but return.
+ */
+ /*
+ * !!! Signalling the wrlsn_cond condition here results in
+ * worse performance because it causes more scheduling churn
+ * and more walking of the slot pool for a very small number
+ * of slots to process. Don't signal here.
+ */
+ return (0);
+ }
+
+ /*
+ * Wait for earlier groups to finish, otherwise there could be holes in the log file.
+ */
+ WT_STAT_CONN_INCR(session, log_release_write_lsn);
+ __log_wait_for_earlier_slot(session, slot);
+
+ log->write_start_lsn = slot->slot_start_lsn;
+ log->write_lsn = slot->slot_end_lsn;
+
+ WT_ASSERT(session, slot != log->active_slot);
+ __wt_cond_signal(session, log->log_write_cond);
+ F_CLR(slot, WT_SLOT_FLUSH);
+
+ /*
+ * Signal the close thread if needed.
+ */
+ if (F_ISSET(slot, WT_SLOT_CLOSEFH))
+ __wt_cond_signal(session, conn->log_file_cond);
+
+ if (F_ISSET(slot, WT_SLOT_SYNC_DIRTY) && !F_ISSET(slot, WT_SLOT_SYNC) &&
+ (ret = __wt_fsync(session, log->log_fh, false)) != 0) {
+ /*
+ * Ignore ENOTSUP, but don't try again.
+ */
+ if (ret != ENOTSUP)
+ WT_ERR(ret);
+ conn->log_dirty_max = 0;
+ }
+
+ /*
+ * Try to consolidate calls to fsync to wait less. Acquire a spin lock so that threads finishing
+ * writing to the log will wait while the current fsync completes and advance log->sync_lsn.
+ */
+ while (F_ISSET(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) {
+ /*
+ * We have to wait until earlier log files have finished their sync operations. The most
+ * recent one will set the LSN to the beginning of our file.
+ */
+ if (log->sync_lsn.l.file < slot->slot_end_lsn.l.file ||
+ __wt_spin_trylock(session, &log->log_sync_lock) != 0) {
+ __wt_cond_wait(session, log->log_sync_cond, 10000, NULL);
+ continue;
+ }
+ locked = true;
+
+ /*
+ * Record the current end of our update after the lock. That is how far our calls can
+ * guarantee.
+ */
+ sync_lsn = slot->slot_end_lsn;
+ /*
+ * Check if we have to sync the parent directory. Some combinations of sync flags may result
+ * in the log file not yet stable in its parent directory. Do that now if needed.
+ */
+ if (F_ISSET(slot, WT_SLOT_SYNC_DIR) && (log->sync_dir_lsn.l.file < sync_lsn.l.file)) {
+ WT_ASSERT(session, log->log_dir_fh != NULL);
+ __wt_verbose(session, WT_VERB_LOG,
+ "log_release: sync directory %s to LSN %" PRIu32 "/%" PRIu32, log->log_dir_fh->name,
+ sync_lsn.l.file, sync_lsn.l.offset);
+ time_start = __wt_clock(session);
+ WT_ERR(__wt_fsync(session, log->log_dir_fh, true));
+ time_stop = __wt_clock(session);
+ fsync_duration_usecs = WT_CLOCKDIFF_US(time_stop, time_start);
+ log->sync_dir_lsn = sync_lsn;
+ WT_STAT_CONN_INCR(session, log_sync_dir);
+ WT_STAT_CONN_INCRV(session, log_sync_dir_duration, fsync_duration_usecs);
+ }
+
+ /*
+ * Sync the log file if needed.
+ */
+ if (F_ISSET(slot, WT_SLOT_SYNC) && __wt_log_cmp(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
+ __wt_verbose(session, WT_VERB_LOG,
+ "log_release: sync log %s to LSN %" PRIu32 "/%" PRIu32, log->log_fh->name,
+ sync_lsn.l.file, sync_lsn.l.offset);
+ WT_STAT_CONN_INCR(session, log_sync);
+ time_start = __wt_clock(session);
+ WT_ERR(__wt_fsync(session, log->log_fh, true));
+ time_stop = __wt_clock(session);
+ fsync_duration_usecs = WT_CLOCKDIFF_US(time_stop, time_start);
+ WT_STAT_CONN_INCRV(session, log_sync_duration, fsync_duration_usecs);
+ log->sync_lsn = sync_lsn;
+ __wt_cond_signal(session, log->log_sync_cond);
+ }
+ /*
+ * Clear the flags before leaving the loop.
+ */
+ F_CLR(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR);
+ locked = false;
+ __wt_spin_unlock(session, &log->log_sync_lock);
+ }
+err:
+ if (locked)
+ __wt_spin_unlock(session, &log->log_sync_lock);
+ if (ret != 0 && slot->slot_error == 0)
+ slot->slot_error = ret;
+ return (ret);
}
/*
* __log_salvage_message --
- * Show messages consistently for a salvageable error.
+ * Show messages consistently for a salvageable error.
*/
static int
-__log_salvage_message(WT_SESSION_IMPL *session, const char *log_name,
- const char *extra_msg, wt_off_t offset)
+__log_salvage_message(
+ WT_SESSION_IMPL *session, const char *log_name, const char *extra_msg, wt_off_t offset)
{
- WT_RET(__wt_msg(session,
- "log file %s corrupted%s at position %" PRIuMAX
- ", truncated", log_name, extra_msg, (uintmax_t)offset));
- F_SET(S2C(session), WT_CONN_DATA_CORRUPTION);
- return (WT_ERROR);
+ WT_RET(__wt_msg(session, "log file %s corrupted%s at position %" PRIuMAX ", truncated",
+ log_name, extra_msg, (uintmax_t)offset));
+ F_SET(S2C(session), WT_CONN_DATA_CORRUPTION);
+ return (WT_ERROR);
}
/*
* __wt_log_scan --
- * Scan the logs, calling a function on each record found.
+ * Scan the logs, calling a function on each record found.
*/
int
__wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags,
- int (*func)(WT_SESSION_IMPL *session,
- WT_ITEM *record, WT_LSN *lsnp, WT_LSN *next_lsnp,
- void *cookie, int firstrecord), void *cookie)
+ int (*func)(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, WT_LSN *next_lsnp,
+ void *cookie, int firstrecord),
+ void *cookie)
{
- WT_CONNECTION_IMPL *conn;
- WT_DECL_ITEM(buf);
- WT_DECL_ITEM(decryptitem);
- WT_DECL_ITEM(uncitem);
- WT_DECL_RET;
- WT_FH *log_fh;
- WT_ITEM *cbbuf;
- WT_LOG *log;
- WT_LOG_RECORD *logrec;
- WT_LSN end_lsn, next_lsn, prev_eof, prev_lsn, rd_lsn, start_lsn;
- wt_off_t bad_offset, log_size;
- uint32_t allocsize, firstlog, lastlog, lognum, rdup_len, reclen;
- uint16_t version;
- u_int i, logcount;
- int firstrecord;
- char **logfiles;
- bool corrupt, eol, need_salvage, partial_record;
-
- conn = S2C(session);
- log = conn->log;
- log_fh = NULL;
- logcount = 0;
- logfiles = NULL;
- corrupt = eol = false;
- firstrecord = 1;
- need_salvage = false;
-
- /*
- * If the caller did not give us a callback function there is nothing
- * to do.
- */
- if (func == NULL)
- return (0);
-
- if (lsnp != NULL &&
- LF_ISSET(WT_LOGSCAN_FIRST|WT_LOGSCAN_FROM_CKP))
- WT_RET_MSG(session, WT_ERROR,
- "choose either a start LSN or a start flag");
- /*
- * Set up the allocation size, starting and ending LSNs. The values
- * for those depend on whether logging is currently enabled or not.
- */
- lastlog = 0;
- if (log != NULL) {
- allocsize = log->allocsize;
- end_lsn = log->alloc_lsn;
- start_lsn = log->first_lsn;
- if (lsnp == NULL) {
- if (LF_ISSET(WT_LOGSCAN_FROM_CKP))
- start_lsn = log->ckpt_lsn;
- else if (!LF_ISSET(WT_LOGSCAN_FIRST))
- WT_RET_MSG(session, WT_ERROR,
- "WT_LOGSCAN_FIRST not set");
- }
- lastlog = log->fileid;
- } else {
- /*
- * If logging is not configured, we can still print out the log
- * if log files exist. We just need to set the LSNs from what
- * is in the files versus what is in the live connection.
- */
- /*
- * Set allocsize to the minimum alignment it could be. Larger
- * records and larger allocation boundaries should always be
- * a multiple of this.
- */
- allocsize = WT_LOG_ALIGN;
- firstlog = UINT32_MAX;
- WT_RET(__log_get_files(session,
- WT_LOG_FILENAME, &logfiles, &logcount));
- if (logcount == 0)
- WT_RET_MSG(session, ENOTSUP, "no log files found");
- for (i = 0; i < logcount; i++) {
- WT_ERR(__wt_log_extract_lognum(session, logfiles[i],
- &lognum));
- lastlog = WT_MAX(lastlog, lognum);
- firstlog = WT_MIN(firstlog, lognum);
- }
- WT_SET_LSN(&start_lsn, firstlog, 0);
- WT_SET_LSN(&end_lsn, lastlog, 0);
- WT_ERR(
- __wt_fs_directory_list_free(session, &logfiles, logcount));
- }
- if (lsnp != NULL) {
- /*
- * Offsets must be on allocation boundaries.
- * An invalid LSN from a user should just return
- * WT_NOTFOUND. It is not an error. But if it is
- * from recovery, we expect valid LSNs so give more
- * information about that.
- */
- if (lsnp->l.offset % allocsize != 0) {
- if (LF_ISSET(WT_LOGSCAN_RECOVER |
- WT_LOGSCAN_RECOVER_METADATA))
- WT_ERR_MSG(session, WT_NOTFOUND,
- "__wt_log_scan unaligned LSN %"
- PRIu32 "/%" PRIu32,
- lsnp->l.file, lsnp->l.offset);
- else
- WT_ERR(WT_NOTFOUND);
- }
- /*
- * If the file is in the future it doesn't exist.
- * An invalid LSN from a user should just return
- * WT_NOTFOUND. It is not an error. But if it is
- * from recovery, we expect valid LSNs so give more
- * information about that.
- */
- if (lsnp->l.file > lastlog) {
- if (LF_ISSET(WT_LOGSCAN_RECOVER |
- WT_LOGSCAN_RECOVER_METADATA))
- WT_ERR_MSG(session, WT_NOTFOUND,
- "__wt_log_scan LSN %" PRIu32 "/%" PRIu32
- " larger than biggest log file %" PRIu32,
- lsnp->l.file, lsnp->l.offset, lastlog);
- else
- WT_ERR(WT_NOTFOUND);
- }
- /*
- * Log cursors may not know the starting LSN. If an
- * LSN is passed in that it is equal to the smallest
- * LSN, start from the beginning of the log.
- */
- if (!WT_IS_INIT_LSN(lsnp))
- start_lsn = *lsnp;
- }
- WT_ERR(__log_open_verify(session, start_lsn.l.file, &log_fh, &prev_lsn,
- NULL, &need_salvage));
- if (need_salvage)
- WT_ERR_MSG(session, WT_ERROR, "log file requires salvage");
- WT_ERR(__wt_filesize(session, log_fh, &log_size));
- rd_lsn = start_lsn;
- if (LF_ISSET(WT_LOGSCAN_RECOVER | WT_LOGSCAN_RECOVER_METADATA))
- __wt_verbose(session, WT_VERB_RECOVERY_PROGRESS,
- "Recovering log %" PRIu32 " through %" PRIu32,
- rd_lsn.l.file, end_lsn.l.file);
-
- WT_ERR(__wt_scr_alloc(session, WT_LOG_ALIGN, &buf));
- WT_ERR(__wt_scr_alloc(session, 0, &decryptitem));
- WT_ERR(__wt_scr_alloc(session, 0, &uncitem));
- for (;;) {
- if (rd_lsn.l.offset + allocsize > log_size) {
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_ITEM(buf);
+ WT_DECL_ITEM(decryptitem);
+ WT_DECL_ITEM(uncitem);
+ WT_DECL_RET;
+ WT_FH *log_fh;
+ WT_ITEM *cbbuf;
+ WT_LOG *log;
+ WT_LOG_RECORD *logrec;
+ WT_LSN end_lsn, next_lsn, prev_eof, prev_lsn, rd_lsn, start_lsn;
+ wt_off_t bad_offset, log_size;
+ uint32_t allocsize, firstlog, lastlog, lognum, rdup_len, reclen;
+ uint16_t version;
+ u_int i, logcount;
+ int firstrecord;
+ char **logfiles;
+ bool corrupt, eol, need_salvage, partial_record;
+
+ conn = S2C(session);
+ log = conn->log;
+ log_fh = NULL;
+ logcount = 0;
+ logfiles = NULL;
+ corrupt = eol = false;
+ firstrecord = 1;
+ need_salvage = false;
+
+ /*
+ * If the caller did not give us a callback function there is nothing to do.
+ */
+ if (func == NULL)
+ return (0);
+
+ if (lsnp != NULL && LF_ISSET(WT_LOGSCAN_FIRST | WT_LOGSCAN_FROM_CKP))
+ WT_RET_MSG(session, WT_ERROR, "choose either a start LSN or a start flag");
+ /*
+ * Set up the allocation size, starting and ending LSNs. The values for those depend on whether
+ * logging is currently enabled or not.
+ */
+ lastlog = 0;
+ if (log != NULL) {
+ allocsize = log->allocsize;
+ end_lsn = log->alloc_lsn;
+ start_lsn = log->first_lsn;
+ if (lsnp == NULL) {
+ if (LF_ISSET(WT_LOGSCAN_FROM_CKP))
+ start_lsn = log->ckpt_lsn;
+ else if (!LF_ISSET(WT_LOGSCAN_FIRST))
+ WT_RET_MSG(session, WT_ERROR, "WT_LOGSCAN_FIRST not set");
+ }
+ lastlog = log->fileid;
+ } else {
+ /*
+ * If logging is not configured, we can still print out the log if log files exist. We just
+ * need to set the LSNs from what is in the files versus what is in the live connection.
+ */
+ /*
+ * Set allocsize to the minimum alignment it could be. Larger records and larger allocation
+ * boundaries should always be a multiple of this.
+ */
+ allocsize = WT_LOG_ALIGN;
+ firstlog = UINT32_MAX;
+ WT_RET(__log_get_files(session, WT_LOG_FILENAME, &logfiles, &logcount));
+ if (logcount == 0)
+ WT_RET_MSG(session, ENOTSUP, "no log files found");
+ for (i = 0; i < logcount; i++) {
+ WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
+ lastlog = WT_MAX(lastlog, lognum);
+ firstlog = WT_MIN(firstlog, lognum);
+ }
+ WT_SET_LSN(&start_lsn, firstlog, 0);
+ WT_SET_LSN(&end_lsn, lastlog, 0);
+ WT_ERR(__wt_fs_directory_list_free(session, &logfiles, logcount));
+ }
+ if (lsnp != NULL) {
+ /*
+ * Offsets must be on allocation boundaries. An invalid LSN from a user should just return
+ * WT_NOTFOUND. It is not an error. But if it is from recovery, we expect valid LSNs so give
+ * more information about that.
+ */
+ if (lsnp->l.offset % allocsize != 0) {
+ if (LF_ISSET(WT_LOGSCAN_RECOVER | WT_LOGSCAN_RECOVER_METADATA))
+ WT_ERR_MSG(session, WT_NOTFOUND, "__wt_log_scan unaligned LSN %" PRIu32 "/%" PRIu32,
+ lsnp->l.file, lsnp->l.offset);
+ else
+ WT_ERR(WT_NOTFOUND);
+ }
+ /*
+ * If the file is in the future it doesn't exist. An invalid LSN from a user should just
+ * return WT_NOTFOUND. It is not an error. But if it is from recovery, we expect valid LSNs
+ * so give more information about that.
+ */
+ if (lsnp->l.file > lastlog) {
+ if (LF_ISSET(WT_LOGSCAN_RECOVER | WT_LOGSCAN_RECOVER_METADATA))
+ WT_ERR_MSG(session, WT_NOTFOUND,
+ "__wt_log_scan LSN %" PRIu32 "/%" PRIu32 " larger than biggest log file %" PRIu32,
+ lsnp->l.file, lsnp->l.offset, lastlog);
+ else
+ WT_ERR(WT_NOTFOUND);
+ }
+ /*
+ * Log cursors may not know the starting LSN. If an LSN is passed in that is equal to the
+ * smallest LSN, start from the beginning of the log.
+ */
+ if (!WT_IS_INIT_LSN(lsnp))
+ start_lsn = *lsnp;
+ }
+ WT_ERR(__log_open_verify(session, start_lsn.l.file, &log_fh, &prev_lsn, NULL, &need_salvage));
+ if (need_salvage)
+ WT_ERR_MSG(session, WT_ERROR, "log file requires salvage");
+ WT_ERR(__wt_filesize(session, log_fh, &log_size));
+ rd_lsn = start_lsn;
+ if (LF_ISSET(WT_LOGSCAN_RECOVER | WT_LOGSCAN_RECOVER_METADATA))
+ __wt_verbose(session, WT_VERB_RECOVERY_PROGRESS,
+ "Recovering log %" PRIu32 " through %" PRIu32, rd_lsn.l.file, end_lsn.l.file);
+
+ WT_ERR(__wt_scr_alloc(session, WT_LOG_ALIGN, &buf));
+ WT_ERR(__wt_scr_alloc(session, 0, &decryptitem));
+ WT_ERR(__wt_scr_alloc(session, 0, &uncitem));
+ for (;;) {
+ if (rd_lsn.l.offset + allocsize > log_size) {
advance:
- if (rd_lsn.l.offset == log_size)
- partial_record = false;
- else {
- /*
- * See if there is anything non-zero at the
- * end of this log file.
- */
- WT_ERR(__log_has_hole(
- session, log_fh, log_size,
- rd_lsn.l.offset, &bad_offset,
- &partial_record));
- if (bad_offset != 0) {
- need_salvage = true;
- WT_ERR(__log_salvage_message(session,
- log_fh->name, "", bad_offset));
- }
- }
- /*
- * If we read the last record, go to the next file.
- */
- WT_ERR(__wt_close(session, &log_fh));
- log_fh = NULL;
- eol = true;
- /*
- * Truncate this log file before we move to the next.
- */
- if (LF_ISSET(WT_LOGSCAN_RECOVER) &&
- __wt_log_cmp(&rd_lsn, &log->trunc_lsn) < 0) {
- __wt_verbose(session, WT_VERB_LOG,
- "Truncate end of log %" PRIu32 "/%" PRIu32,
- rd_lsn.l.file, rd_lsn.l.offset);
- WT_ERR(__log_truncate(session, &rd_lsn, true,
- false));
- }
- /*
- * If we had a partial record, we'll want to break
- * now after closing and truncating. Although for now
- * log_truncate does not modify the LSN passed in,
- * this code does not assume it is unmodified after that
- * call which is why it uses the boolean set earlier.
- */
- if (partial_record)
- break;
- /*
- * Avoid an error message when we reach end of log
- * by checking here.
- */
- prev_eof = rd_lsn;
- WT_SET_LSN(&rd_lsn, rd_lsn.l.file + 1, 0);
- if (rd_lsn.l.file > end_lsn.l.file)
- break;
- if (LF_ISSET(WT_LOGSCAN_RECOVER |
- WT_LOGSCAN_RECOVER_METADATA))
- __wt_verbose(session, WT_VERB_RECOVERY_PROGRESS,
- "Recovering log %" PRIu32
- " through %" PRIu32,
- rd_lsn.l.file, end_lsn.l.file);
- WT_ERR(__log_open_verify(session,
- rd_lsn.l.file, &log_fh, &prev_lsn, &version,
- &need_salvage));
- if (need_salvage)
- WT_ERR_MSG(session, WT_ERROR,
- "log file requires salvage");
- /*
- * Opening the log file reads with verify sets up the
- * previous LSN from the first record. This detects
- * a "hole" at the end of the previous log file.
- */
- if (LF_ISSET(WT_LOGSCAN_RECOVER) &&
- !WT_IS_INIT_LSN(&prev_lsn) &&
- !WT_IS_ZERO_LSN(&prev_lsn) &&
- prev_lsn.l.offset != prev_eof.l.offset) {
- WT_ASSERT(session,
- prev_eof.l.file == prev_lsn.l.file);
- break;
- }
- /*
- * If we read a current version log file without a
- * previous LSN record the log ended after writing
- * that header. We're done.
- */
- if (LF_ISSET(WT_LOGSCAN_RECOVER) &&
- version == WT_LOG_VERSION_SYSTEM &&
- WT_IS_ZERO_LSN(&prev_lsn)) {
- __wt_verbose(session, WT_VERB_LOG,
- "log_scan: Stopping, no system "
- "record detected in %s.", log_fh->name);
- break;
- }
- WT_ERR(__wt_filesize(session, log_fh, &log_size));
- eol = false;
- continue;
- }
- /*
- * Read the minimum allocation size a record could be.
- * Conditionally set the need_salvage flag so that if the
- * read fails, we know this is an situation we can salvage.
- */
- WT_ASSERT(session, buf->memsize >= allocsize);
- need_salvage = F_ISSET(conn, WT_CONN_SALVAGE);
- WT_ERR(__log_fs_read(session,
- log_fh, rd_lsn.l.offset, (size_t)allocsize, buf->mem));
- need_salvage = false;
- /*
- * See if we need to read more than the allocation size. We
- * expect that we rarely will have to read more. Most log
- * records will be fairly small.
- */
- reclen = ((WT_LOG_RECORD *)buf->mem)->len;
+ if (rd_lsn.l.offset == log_size)
+ partial_record = false;
+ else {
+ /*
+ * See if there is anything non-zero at the end of this log file.
+ */
+ WT_ERR(__log_has_hole(
+ session, log_fh, log_size, rd_lsn.l.offset, &bad_offset, &partial_record));
+ if (bad_offset != 0) {
+ need_salvage = true;
+ WT_ERR(__log_salvage_message(session, log_fh->name, "", bad_offset));
+ }
+ }
+ /*
+ * If we read the last record, go to the next file.
+ */
+ WT_ERR(__wt_close(session, &log_fh));
+ log_fh = NULL;
+ eol = true;
+ /*
+ * Truncate this log file before we move to the next.
+ */
+ if (LF_ISSET(WT_LOGSCAN_RECOVER) && __wt_log_cmp(&rd_lsn, &log->trunc_lsn) < 0) {
+ __wt_verbose(session, WT_VERB_LOG, "Truncate end of log %" PRIu32 "/%" PRIu32,
+ rd_lsn.l.file, rd_lsn.l.offset);
+ WT_ERR(__log_truncate(session, &rd_lsn, true, false));
+ }
+ /*
+ * If we had a partial record, we'll want to break now after closing and truncating.
+ * Although for now log_truncate does not modify the LSN passed in, this code does not
+ * assume it is unmodified after that call which is why it uses the boolean set earlier.
+ */
+ if (partial_record)
+ break;
+ /*
+ * Avoid an error message when we reach end of log by checking here.
+ */
+ prev_eof = rd_lsn;
+ WT_SET_LSN(&rd_lsn, rd_lsn.l.file + 1, 0);
+ if (rd_lsn.l.file > end_lsn.l.file)
+ break;
+ if (LF_ISSET(WT_LOGSCAN_RECOVER | WT_LOGSCAN_RECOVER_METADATA))
+ __wt_verbose(session, WT_VERB_RECOVERY_PROGRESS,
+ "Recovering log %" PRIu32 " through %" PRIu32, rd_lsn.l.file, end_lsn.l.file);
+ WT_ERR(__log_open_verify(
+ session, rd_lsn.l.file, &log_fh, &prev_lsn, &version, &need_salvage));
+ if (need_salvage)
+ WT_ERR_MSG(session, WT_ERROR, "log file requires salvage");
+ /*
+ * Opening the log file with verify sets up the previous LSN from the first record. That
+ * lets us detect a "hole" at the end of the previous log file.
+ */
+ if (LF_ISSET(WT_LOGSCAN_RECOVER) && !WT_IS_INIT_LSN(&prev_lsn) &&
+ !WT_IS_ZERO_LSN(&prev_lsn) && prev_lsn.l.offset != prev_eof.l.offset) {
+ WT_ASSERT(session, prev_eof.l.file == prev_lsn.l.file);
+ break;
+ }
+ /*
+ * If we read a current version log file without a previous LSN record, the log ended
+ * after writing that header. We're done.
+ */
+ if (LF_ISSET(WT_LOGSCAN_RECOVER) && version == WT_LOG_VERSION_SYSTEM &&
+ WT_IS_ZERO_LSN(&prev_lsn)) {
+ __wt_verbose(session, WT_VERB_LOG,
+ "log_scan: Stopping, no system "
+ "record detected in %s.",
+ log_fh->name);
+ break;
+ }
+ WT_ERR(__wt_filesize(session, log_fh, &log_size));
+ eol = false;
+ continue;
+ }
+ /*
+ * Read the minimum allocation size a record could be. Conditionally set the need_salvage
+ * flag so that if the read fails, we know this is a situation we can salvage.
+ */
+ WT_ASSERT(session, buf->memsize >= allocsize);
+ need_salvage = F_ISSET(conn, WT_CONN_SALVAGE);
+ WT_ERR(__log_fs_read(session, log_fh, rd_lsn.l.offset, (size_t)allocsize, buf->mem));
+ need_salvage = false;
+ /*
+ * See if we need to read more than the allocation size. We expect that we rarely will have
+ * to read more. Most log records will be fairly small.
+ */
+ reclen = ((WT_LOG_RECORD *)buf->mem)->len;
#ifdef WORDS_BIGENDIAN
- reclen = __wt_bswap32(reclen);
+ reclen = __wt_bswap32(reclen);
#endif
- /*
- * Log files are pre-allocated. We need to detect the
- * difference between a hole in the file (where this location
- * would be considered the end of log) and the last record
- * in the log and we're at the zeroed part of the file.
- * If we find a zeroed record, scan forward in the log looking
- * for any data. If we detect any we have a hole and stop.
- * Otherwise if the rest is all zeroes advance to the next file.
- * When recovery finds the end of the log, truncate the file
- * and remove any later log files that may exist.
- */
- if (reclen == 0) {
- WT_ERR(__log_has_hole(
- session, log_fh, log_size, rd_lsn.l.offset,
- &bad_offset, &eol));
- if (bad_offset != 0) {
- need_salvage = true;
- WT_ERR(__log_salvage_message(session,
- log_fh->name, "", bad_offset));
- }
- if (eol)
- /* Found a hole. This LSN is the end. */
- break;
- /* Last record in log. Look for more. */
- goto advance;
- }
- rdup_len = __wt_rduppo2(reclen, allocsize);
- if (reclen > allocsize) {
- /*
- * The log file end could be the middle of this
- * log record. If we have a partially written record
- * then this is considered the end of the log.
- */
- if (rd_lsn.l.offset + rdup_len > log_size) {
- eol = true;
- break;
- }
- /*
- * We need to round up and read in the full padded
- * record, especially for direct I/O.
- */
- WT_ERR(__wt_buf_grow(session, buf, rdup_len));
- WT_ERR(__log_fs_read(session, log_fh,
- rd_lsn.l.offset, (size_t)rdup_len, buf->mem));
- WT_STAT_CONN_INCR(session, log_scan_rereads);
- }
- /*
- * We read in the record, now verify the checksum. A failed
- * checksum does not imply corruption, it may be the result
- * of a partial write.
- */
- buf->size = reclen;
- logrec = (WT_LOG_RECORD *)buf->mem;
- if (!__log_checksum_match(buf, reclen)) {
- /*
- * A checksum mismatch means we have reached the end of
- * the useful part of the log. This should be found on
- * the first pass through recovery. In the second pass
- * where we truncate the log, this is where it should
- * end.
- * Continue processing where possible, so remember any
- * error returns, but don't skip to the error handler.
- */
- if (log != NULL)
- log->trunc_lsn = rd_lsn;
- /*
- * If the user asked for a specific LSN and it is not
- * a valid LSN, return WT_NOTFOUND.
- */
- if (LF_ISSET(WT_LOGSCAN_ONE))
- ret = WT_NOTFOUND;
-
- /*
- * When we have a checksum mismatch, we would like
- * to determine whether it may be the result of:
- * 1) some expected corruption that can occur during
- * backups
- * 2) a partial write that can naturally occur when
- * an application crashes
- * 3) some other corruption
- * so that we can (in case 3) flag cases of file system
- * or hardware failures. Unfortunately, we have found
- * on some systems that file system writes may in fact
- * be lost, and this can readily be triggered with
- * normal operations. Rather than force users to
- * salvage in these situations, we merely truncate the
- * log at this point and issue a message.
- */
- if (F_ISSET(conn, WT_CONN_WAS_BACKUP))
- break;
-
- if (!__log_check_partial_write(session, buf, reclen)) {
- /*
- * It's not a partial write, and we have a bad
- * checksum. We treat it as a corruption that
- * must be salvaged.
- */
- need_salvage = true;
- WT_TRET(__log_salvage_message(session,
- log_fh->name, ", bad checksum",
- rd_lsn.l.offset));
- } else {
- /*
- * It may be a partial write, or it's possible
- * that the header is corrupt. Make a sanity
- * check of the log record header.
- */
- WT_TRET(__log_record_verify(session, log_fh,
- rd_lsn.l.offset, logrec, &corrupt));
- if (corrupt) {
- need_salvage = true;
- WT_TRET(__log_salvage_message(session,
- log_fh->name, "", rd_lsn.l.offset));
- }
- }
- break;
- }
- __wt_log_record_byteswap(logrec);
-
- /*
- * We have a valid log record. If it is not the log file
- * header, invoke the callback.
- */
- WT_STAT_CONN_INCR(session, log_scan_records);
- next_lsn = rd_lsn;
- next_lsn.l.offset += rdup_len;
- if (rd_lsn.l.offset != 0) {
- /*
- * We need to manage the different buffers here.
- * Buf is the buffer this function uses to read from
- * the disk. The callback buffer may change based
- * on whether encryption and compression are used.
- *
- * We want to free any buffers from compression and
- * encryption but keep the one we use for reading.
- */
- cbbuf = buf;
- if (F_ISSET(logrec, WT_LOG_RECORD_ENCRYPTED)) {
- WT_ERR(__log_decrypt(
- session, cbbuf, decryptitem));
- cbbuf = decryptitem;
- }
- if (F_ISSET(logrec, WT_LOG_RECORD_COMPRESSED)) {
- WT_ERR(__log_decompress(
- session, cbbuf, uncitem));
- cbbuf = uncitem;
- }
- WT_ERR((*func)(session,
- cbbuf, &rd_lsn, &next_lsn, cookie, firstrecord));
-
- firstrecord = 0;
-
- if (LF_ISSET(WT_LOGSCAN_ONE))
- break;
- }
- rd_lsn = next_lsn;
- }
-
- /* Truncate if we're in recovery. */
- if (LF_ISSET(WT_LOGSCAN_RECOVER) &&
- __wt_log_cmp(&rd_lsn, &log->trunc_lsn) < 0) {
- __wt_verbose(session, WT_VERB_LOG,
- "End of recovery truncate end of log %" PRIu32 "/%" PRIu32,
- rd_lsn.l.file, rd_lsn.l.offset);
- /* Preserve prior error and fall through to error handling. */
- WT_TRET(__log_truncate(session, &rd_lsn, false, false));
- }
-
-err: WT_STAT_CONN_INCR(session, log_scans);
- /*
- * If we are salvaging and failed a salvageable operation, then
- * truncate the log at the fail point.
- */
- if (ret != 0 && ret != WT_PANIC && need_salvage) {
- WT_TRET(__wt_close(session, &log_fh));
- log_fh = NULL;
- WT_TRET(__log_truncate(session, &rd_lsn, false, true));
- ret = 0;
- }
-
- /*
- * If the first attempt to read a log record results in
- * an error recovery is likely going to fail. Try to provide
- * a helpful failure message.
- */
- if (ret != 0 && firstrecord && LF_ISSET(WT_LOGSCAN_RECOVER |
- WT_LOGSCAN_RECOVER_METADATA)) {
- __wt_err(session, ret,
- "WiredTiger is unable to read the recovery log.");
- __wt_err(session, ret, "This may be due to the log"
- " files being encrypted, being from an older"
- " version or due to corruption on disk");
- __wt_err(session, ret, "You should confirm that you have"
- " opened the database with the correct options including"
- " all encryption and compression options");
- }
-
- WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount));
-
- __wt_scr_free(session, &buf);
- __wt_scr_free(session, &decryptitem);
- __wt_scr_free(session, &uncitem);
-
- /*
- * If the caller wants one record and it is at the end of log,
- * return WT_NOTFOUND.
- */
- if (LF_ISSET(WT_LOGSCAN_ONE) && eol && ret == 0)
- ret = WT_NOTFOUND;
- WT_TRET(__wt_close(session, &log_fh));
- return (ret);
+ /*
+ * Log files are pre-allocated. We need to distinguish a hole in the file (where this
+ * location would be considered the end of log) from having read the last record in the log
+ * and reached the zeroed part of the file. If we find a zeroed record, scan forward in the
+ * log looking for any data. If we detect any, we have a hole and stop. Otherwise, if the rest
+ * is all zeroes, advance to the next file. When recovery finds the end of the log, truncate
+ * the file and remove any later log files that may exist.
+ */
+ if (reclen == 0) {
+ WT_ERR(__log_has_hole(session, log_fh, log_size, rd_lsn.l.offset, &bad_offset, &eol));
+ if (bad_offset != 0) {
+ need_salvage = true;
+ WT_ERR(__log_salvage_message(session, log_fh->name, "", bad_offset));
+ }
+ if (eol)
+ /* Found a hole. This LSN is the end. */
+ break;
+ /* Last record in log. Look for more. */
+ goto advance;
+ }
+ rdup_len = __wt_rduppo2(reclen, allocsize);
+ if (reclen > allocsize) {
+ /*
+ * The log file end could be the middle of this log record. If we have a partially
+ * written record then this is considered the end of the log.
+ */
+ if (rd_lsn.l.offset + rdup_len > log_size) {
+ eol = true;
+ break;
+ }
+ /*
+ * We need to round up and read in the full padded record, especially for direct I/O.
+ */
+ WT_ERR(__wt_buf_grow(session, buf, rdup_len));
+ WT_ERR(__log_fs_read(session, log_fh, rd_lsn.l.offset, (size_t)rdup_len, buf->mem));
+ WT_STAT_CONN_INCR(session, log_scan_rereads);
+ }
+ /*
+ * We read in the record, now verify the checksum. A failed checksum does not imply
+ * corruption, it may be the result of a partial write.
+ */
+ buf->size = reclen;
+ logrec = (WT_LOG_RECORD *)buf->mem;
+ if (!__log_checksum_match(buf, reclen)) {
+ /*
+ * A checksum mismatch means we have reached the end of the useful part of the log. This
+ * should be found on the first pass through recovery. In the second pass where we
+ * truncate the log, this is where it should end. Continue processing where possible, so
+ * remember any error returns, but don't skip to the error handler.
+ */
+ if (log != NULL)
+ log->trunc_lsn = rd_lsn;
+ /*
+ * If the user asked for a specific LSN and it is not a valid LSN, return WT_NOTFOUND.
+ */
+ if (LF_ISSET(WT_LOGSCAN_ONE))
+ ret = WT_NOTFOUND;
+
+ /*
+ * When we have a checksum mismatch, we would like
+ * to determine whether it may be the result of:
+ * 1) some expected corruption that can occur during
+ * backups
+ * 2) a partial write that can naturally occur when
+ * an application crashes
+ * 3) some other corruption
+ * so that we can (in case 3) flag cases of file system
+ * or hardware failures. Unfortunately, we have found
+ * on some systems that file system writes may in fact
+ * be lost, and this can readily be triggered with
+ * normal operations. Rather than force users to
+ * salvage in these situations, we merely truncate the
+ * log at this point and issue a message.
+ */
+ if (F_ISSET(conn, WT_CONN_WAS_BACKUP))
+ break;
+
+ if (!__log_check_partial_write(session, buf, reclen)) {
+ /*
+ * It's not a partial write, and we have a bad checksum. We treat it as a corruption
+ * that must be salvaged.
+ */
+ need_salvage = true;
+ WT_TRET(
+ __log_salvage_message(session, log_fh->name, ", bad checksum", rd_lsn.l.offset));
+ } else {
+ /*
+ * It may be a partial write, or it's possible that the header is corrupt. Make a
+ * sanity check of the log record header.
+ */
+ WT_TRET(__log_record_verify(session, log_fh, rd_lsn.l.offset, logrec, &corrupt));
+ if (corrupt) {
+ need_salvage = true;
+ WT_TRET(__log_salvage_message(session, log_fh->name, "", rd_lsn.l.offset));
+ }
+ }
+ break;
+ }
+ __wt_log_record_byteswap(logrec);
+
+ /*
+ * We have a valid log record. If it is not the log file header, invoke the callback.
+ */
+ WT_STAT_CONN_INCR(session, log_scan_records);
+ next_lsn = rd_lsn;
+ next_lsn.l.offset += rdup_len;
+ if (rd_lsn.l.offset != 0) {
+ /*
+ * We need to manage the different buffers here.
+ * Buf is the buffer this function uses to read from
+ * the disk. The callback buffer may change based
+ * on whether encryption and compression are used.
+ *
+ * We want to free any buffers from compression and
+ * encryption but keep the one we use for reading.
+ */
+ cbbuf = buf;
+ if (F_ISSET(logrec, WT_LOG_RECORD_ENCRYPTED)) {
+ WT_ERR(__log_decrypt(session, cbbuf, decryptitem));
+ cbbuf = decryptitem;
+ }
+ if (F_ISSET(logrec, WT_LOG_RECORD_COMPRESSED)) {
+ WT_ERR(__log_decompress(session, cbbuf, uncitem));
+ cbbuf = uncitem;
+ }
+ WT_ERR((*func)(session, cbbuf, &rd_lsn, &next_lsn, cookie, firstrecord));
+
+ firstrecord = 0;
+
+ if (LF_ISSET(WT_LOGSCAN_ONE))
+ break;
+ }
+ rd_lsn = next_lsn;
+ }
+
+ /* Truncate if we're in recovery. */
+ if (LF_ISSET(WT_LOGSCAN_RECOVER) && __wt_log_cmp(&rd_lsn, &log->trunc_lsn) < 0) {
+ __wt_verbose(session, WT_VERB_LOG,
+ "End of recovery truncate end of log %" PRIu32 "/%" PRIu32, rd_lsn.l.file,
+ rd_lsn.l.offset);
+ /* Preserve prior error and fall through to error handling. */
+ WT_TRET(__log_truncate(session, &rd_lsn, false, false));
+ }
+
+err:
+ WT_STAT_CONN_INCR(session, log_scans);
+ /*
+ * If we are salvaging and failed a salvageable operation, then truncate the log at the fail
+ * point.
+ */
+ if (ret != 0 && ret != WT_PANIC && need_salvage) {
+ WT_TRET(__wt_close(session, &log_fh));
+ log_fh = NULL;
+ WT_TRET(__log_truncate(session, &rd_lsn, false, true));
+ ret = 0;
+ }
+
+ /*
+ * If the first attempt to read a log record results in an error, recovery is likely going to
+ * fail. Try to provide a helpful failure message.
+ */
+ if (ret != 0 && firstrecord && LF_ISSET(WT_LOGSCAN_RECOVER | WT_LOGSCAN_RECOVER_METADATA)) {
+ __wt_err(session, ret, "WiredTiger is unable to read the recovery log");
+ __wt_err(session, ret,
+ "This may be due to the log"
+ " files being encrypted, being from an older"
+ " version or due to corruption on disk");
+ __wt_err(session, ret,
+ "You should confirm that you have"
+ " opened the database with the correct options including"
+ " all encryption and compression options");
+ }
+
+ WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount));
+
+ __wt_scr_free(session, &buf);
+ __wt_scr_free(session, &decryptitem);
+ __wt_scr_free(session, &uncitem);
+
+ /*
+ * If the caller wants one record and it is at the end of log, return WT_NOTFOUND.
+ */
+ if (LF_ISSET(WT_LOGSCAN_ONE) && eol && ret == 0)
+ ret = WT_NOTFOUND;
+ WT_TRET(__wt_close(session, &log_fh));
+ return (ret);
}
/*
* __wt_log_force_write --
- * Force a switch and release and write of the current slot.
- * Wrapper function that takes the lock.
+ * Force a switch and release and write of the current slot. Wrapper function that takes the
+ * lock.
*/
int
__wt_log_force_write(WT_SESSION_IMPL *session, bool retry, bool *did_work)
{
- WT_LOG *log;
- WT_MYSLOT myslot;
-
- log = S2C(session)->log;
- memset(&myslot, 0, sizeof(myslot));
- WT_STAT_CONN_INCR(session, log_force_write);
- if (did_work != NULL)
- *did_work = true;
- myslot.slot = log->active_slot;
- return (__wt_log_slot_switch(session, &myslot, retry, true, did_work));
+ WT_LOG *log;
+ WT_MYSLOT myslot;
+
+ log = S2C(session)->log;
+ memset(&myslot, 0, sizeof(myslot));
+ WT_STAT_CONN_INCR(session, log_force_write);
+ if (did_work != NULL)
+ *did_work = true;
+ myslot.slot = log->active_slot;
+ return (__wt_log_slot_switch(session, &myslot, retry, true, did_work));
}
/*
* __wt_log_write --
- * Write a record into the log, compressing as necessary.
+ * Write a record into the log, compressing as necessary.
*/
int
-__wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
- uint32_t flags)
+__wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, uint32_t flags)
{
- WT_COMPRESSOR *compressor;
- WT_CONNECTION_IMPL *conn;
- WT_DECL_ITEM(citem);
- WT_DECL_ITEM(eitem);
- WT_DECL_RET;
- WT_ITEM *ip;
- WT_KEYED_ENCRYPTOR *kencryptor;
- WT_LOG *log;
- WT_LOG_RECORD *newlrp;
- size_t dst_len, len, new_size, result_len, src_len;
- uint8_t *dst, *src;
- int compression_failed;
-
- conn = S2C(session);
- log = conn->log;
- /*
- * An error during opening the logging subsystem can result in it
- * being enabled, but without an open log file. In that case,
- * just return. We can also have logging opened for reading in a
- * read-only database and attempt to write a record on close.
- */
- if (!F_ISSET(log, WT_LOG_OPENED) || F_ISSET(conn, WT_CONN_READONLY))
- return (0);
- ip = record;
- if ((compressor = conn->log_compressor) != NULL &&
- record->size < log->allocsize) {
- WT_STAT_CONN_INCR(session, log_compress_small);
- } else if (compressor != NULL) {
- /* Skip the log header */
- src = (uint8_t *)record->mem + WT_LOG_COMPRESS_SKIP;
- src_len = record->size - WT_LOG_COMPRESS_SKIP;
-
- /*
- * Compute the size needed for the destination buffer. We only
- * allocate enough memory for a copy of the original by default,
- * if any compressed version is bigger than the original, we
- * won't use it. However, some compression engines (snappy is
- * one example), may need more memory because they don't stop
- * just because there's no more memory into which to compress.
- */
- if (compressor->pre_size == NULL)
- len = src_len;
- else
- WT_ERR(compressor->pre_size(compressor,
- &session->iface, src, src_len, &len));
-
- new_size = len + WT_LOG_COMPRESS_SKIP;
- WT_ERR(__wt_scr_alloc(session, new_size, &citem));
-
- /* Skip the header bytes of the destination data. */
- dst = (uint8_t *)citem->mem + WT_LOG_COMPRESS_SKIP;
- dst_len = len;
-
- compression_failed = 0;
- WT_ERR(compressor->compress(compressor, &session->iface,
- src, src_len, dst, dst_len, &result_len,
- &compression_failed));
- result_len += WT_LOG_COMPRESS_SKIP;
-
- /*
- * If compression fails, or doesn't gain us at least one unit of
- * allocation, fallback to the original version. This isn't
- * unexpected: if compression doesn't work for some chunk of
- * data for some reason (noting likely additional format/header
- * information which compressed output requires), it just means
- * the uncompressed version is as good as it gets, and that's
- * what we use.
- */
- if (compression_failed ||
- result_len / log->allocsize >=
- record->size / log->allocsize)
- WT_STAT_CONN_INCR(session, log_compress_write_fails);
- else {
- WT_STAT_CONN_INCR(session, log_compress_writes);
- WT_STAT_CONN_INCRV(session, log_compress_mem,
- record->size);
- WT_STAT_CONN_INCRV(session, log_compress_len,
- result_len);
-
- /*
- * Copy in the skipped header bytes, set the final data
- * size.
- */
- memcpy(citem->mem, record->mem, WT_LOG_COMPRESS_SKIP);
- citem->size = result_len;
- ip = citem;
- newlrp = (WT_LOG_RECORD *)citem->mem;
- F_SET(newlrp, WT_LOG_RECORD_COMPRESSED);
- WT_ASSERT(session, result_len < UINT32_MAX &&
- record->size < UINT32_MAX);
- newlrp->mem_len = WT_STORE_SIZE(record->size);
- }
- }
- if ((kencryptor = conn->kencryptor) != NULL) {
- /*
- * Allocate enough space for the original record plus the
- * encryption size constant plus the length we store.
- */
- __wt_encrypt_size(session, kencryptor, ip->size, &new_size);
- WT_ERR(__wt_scr_alloc(session, new_size, &eitem));
-
- WT_ERR(__wt_encrypt(session, kencryptor,
- WT_LOG_ENCRYPT_SKIP, ip, eitem));
-
- /*
- * Final setup of new buffer. Set the flag for
- * encryption in the record header.
- */
- ip = eitem;
- newlrp = (WT_LOG_RECORD *)eitem->mem;
- F_SET(newlrp, WT_LOG_RECORD_ENCRYPTED);
- WT_ASSERT(session, new_size < UINT32_MAX &&
- ip->size < UINT32_MAX);
- }
- ret = __log_write_internal(session, ip, lsnp, flags);
-
-err: __wt_scr_free(session, &citem);
- __wt_scr_free(session, &eitem);
- return (ret);
+ WT_COMPRESSOR *compressor;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_ITEM(citem);
+ WT_DECL_ITEM(eitem);
+ WT_DECL_RET;
+ WT_ITEM *ip;
+ WT_KEYED_ENCRYPTOR *kencryptor;
+ WT_LOG *log;
+ WT_LOG_RECORD *newlrp;
+ size_t dst_len, len, new_size, result_len, src_len;
+ uint8_t *dst, *src;
+ int compression_failed;
+
+ conn = S2C(session);
+ log = conn->log;
+ /*
+ * An error during opening the logging subsystem can result in it being enabled, but without an
+ * open log file. In that case, just return. We can also have logging opened for reading in a
+ * read-only database and attempt to write a record on close.
+ */
+ if (!F_ISSET(log, WT_LOG_OPENED) || F_ISSET(conn, WT_CONN_READONLY))
+ return (0);
+ ip = record;
+ if ((compressor = conn->log_compressor) != NULL && record->size < log->allocsize) {
+ WT_STAT_CONN_INCR(session, log_compress_small);
+ } else if (compressor != NULL) {
+ /* Skip the log header */
+ src = (uint8_t *)record->mem + WT_LOG_COMPRESS_SKIP;
+ src_len = record->size - WT_LOG_COMPRESS_SKIP;
+
+ /*
+ * Compute the size needed for the destination buffer. We only allocate enough memory for a
+ * copy of the original by default; if any compressed version is bigger than the original,
+ * we won't use it. However, some compression engines (snappy is one example) may need more
+ * memory because they don't stop just because there's no more memory into which to
+ * compress.
+ */
+ if (compressor->pre_size == NULL)
+ len = src_len;
+ else
+ WT_ERR(compressor->pre_size(compressor, &session->iface, src, src_len, &len));
+
+ new_size = len + WT_LOG_COMPRESS_SKIP;
+ WT_ERR(__wt_scr_alloc(session, new_size, &citem));
+
+ /* Skip the header bytes of the destination data. */
+ dst = (uint8_t *)citem->mem + WT_LOG_COMPRESS_SKIP;
+ dst_len = len;
+
+ compression_failed = 0;
+ WT_ERR(compressor->compress(compressor, &session->iface, src, src_len, dst, dst_len,
+ &result_len, &compression_failed));
+ result_len += WT_LOG_COMPRESS_SKIP;
+
+ /*
+ * If compression fails, or doesn't gain us at least one unit of allocation, fall back to the
+ * original version. This isn't unexpected: if compression doesn't work for some chunk of
+ * data for some reason (noting likely additional format/header information which compressed
+ * output requires), it just means the uncompressed version is as good as it gets, and
+ * that's what we use.
+ */
+ if (compression_failed || result_len / log->allocsize >= record->size / log->allocsize)
+ WT_STAT_CONN_INCR(session, log_compress_write_fails);
+ else {
+ WT_STAT_CONN_INCR(session, log_compress_writes);
+ WT_STAT_CONN_INCRV(session, log_compress_mem, record->size);
+ WT_STAT_CONN_INCRV(session, log_compress_len, result_len);
+
+ /*
+ * Copy in the skipped header bytes, set the final data size.
+ */
+ memcpy(citem->mem, record->mem, WT_LOG_COMPRESS_SKIP);
+ citem->size = result_len;
+ ip = citem;
+ newlrp = (WT_LOG_RECORD *)citem->mem;
+ F_SET(newlrp, WT_LOG_RECORD_COMPRESSED);
+ WT_ASSERT(session, result_len < UINT32_MAX && record->size < UINT32_MAX);
+ newlrp->mem_len = WT_STORE_SIZE(record->size);
+ }
+ }
+ if ((kencryptor = conn->kencryptor) != NULL) {
+ /*
+ * Allocate enough space for the original record plus the encryption size constant plus the
+ * length we store.
+ */
+ __wt_encrypt_size(session, kencryptor, ip->size, &new_size);
+ WT_ERR(__wt_scr_alloc(session, new_size, &eitem));
+
+ WT_ERR(__wt_encrypt(session, kencryptor, WT_LOG_ENCRYPT_SKIP, ip, eitem));
+
+ /*
+ * Final setup of new buffer. Set the flag for encryption in the record header.
+ */
+ ip = eitem;
+ newlrp = (WT_LOG_RECORD *)eitem->mem;
+ F_SET(newlrp, WT_LOG_RECORD_ENCRYPTED);
+ WT_ASSERT(session, new_size < UINT32_MAX && ip->size < UINT32_MAX);
+ }
+ ret = __log_write_internal(session, ip, lsnp, flags);
+
+err:
+ __wt_scr_free(session, &citem);
+ __wt_scr_free(session, &eitem);
+ return (ret);
}
/*
* __log_write_internal --
- * Write a record into the log.
+ * Write a record into the log.
*/
static int
-__log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
- uint32_t flags)
+__log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, uint32_t flags)
{
- WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
- WT_LOG *log;
- WT_LOG_RECORD *logrec;
- WT_LSN lsn;
- WT_MYSLOT myslot;
- int64_t release_size;
- uint32_t fill_size, force, rdup_len;
- bool free_slot;
-
- conn = S2C(session);
- log = conn->log;
- if (record->size > UINT32_MAX)
- WT_RET_MSG(session, EFBIG,
- "Log record size of %" WT_SIZET_FMT " exceeds the maximum "
- "supported size of %" PRIu32,
- record->size, UINT32_MAX);
- WT_INIT_LSN(&lsn);
- myslot.slot = NULL;
- memset(&myslot, 0, sizeof(myslot));
- /*
- * Assume the WT_ITEM the caller passed is a WT_LOG_RECORD, which has a
- * header at the beginning for us to fill in.
- *
- * If using direct_io, the caller should pass us an aligned record.
- * But we need to make sure it is big enough and zero-filled so
- * that we can write the full amount. Do this whether or not
- * direct_io is in use because it makes the reading code cleaner.
- */
- WT_STAT_CONN_INCRV(session, log_bytes_payload, record->size);
- rdup_len = __wt_rduppo2((uint32_t)record->size, log->allocsize);
- WT_ERR(__wt_buf_grow(session, record, rdup_len));
- WT_ASSERT(session, record->data == record->mem);
- /*
- * If the caller's record only partially fills the necessary
- * space, we need to zero-fill the remainder.
- *
- * The cast is safe, we've already checked to make sure it's in range.
- */
- fill_size = rdup_len - (uint32_t)record->size;
- if (fill_size != 0) {
- memset((uint8_t *)record->mem + record->size, 0, fill_size);
- /*
- * Set the last byte of the log record to a non-zero value,
- * that allows us, on the input side, to tell that a log
- * record was completely written; there couldn't have been
- * a partial write. That means that any checksum mismatch
- * in those conditions is a log corruption.
- *
- * Without this changed byte, when we see a zeroed last byte,
- * we must always treat a checksum error as a possible partial
- * write. Since partial writes can happen as a result of an
- * interrupted process (for example, a shutdown), we must
- * treat a checksum error as a normal occurrence, and merely
- * the place where the log must be truncated. So any real
- * corruption within log records is hard to detect as such.
- *
- * However, we can only make this modification if there is
- * more than one byte being filled, as the first zero byte
- * past the actual record is needed to terminate the loop
- * in txn_commit_apply.
- *
- * This is not a log format change, as we only are changing a
- * byte in the padding portion of a record, and no logging code
- * has ever checked that it is any particular value up to now.
- */
- if (fill_size > 1)
- *((uint8_t *)record->mem + rdup_len - 1) =
- WT_DEBUG_BYTE;
- record->size = rdup_len;
- }
- /*
- * Checksum a little-endian version of the header, and write everything
- * in little-endian format. The checksum is (potentially) returned in a
- * big-endian format, swap it into place in a separate step.
- */
- logrec = (WT_LOG_RECORD *)record->mem;
- logrec->len = (uint32_t)record->size;
- logrec->checksum = 0;
- __wt_log_record_byteswap(logrec);
- logrec->checksum = __wt_checksum(logrec, record->size);
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+ WT_LOG_RECORD *logrec;
+ WT_LSN lsn;
+ WT_MYSLOT myslot;
+ int64_t release_size;
+ uint32_t fill_size, force, rdup_len;
+ bool free_slot;
+
+ conn = S2C(session);
+ log = conn->log;
+ if (record->size > UINT32_MAX)
+ WT_RET_MSG(session, EFBIG, "Log record size of %" WT_SIZET_FMT
+ " exceeds the maximum "
+ "supported size of %" PRIu32,
+ record->size, UINT32_MAX);
+ WT_INIT_LSN(&lsn);
+ myslot.slot = NULL;
+ memset(&myslot, 0, sizeof(myslot));
+ /*
+ * Assume the WT_ITEM the caller passed is a WT_LOG_RECORD, which has a
+ * header at the beginning for us to fill in.
+ *
+ * If using direct_io, the caller should pass us an aligned record.
+ * But we need to make sure it is big enough and zero-filled so
+ * that we can write the full amount. Do this whether or not
+ * direct_io is in use because it makes the reading code cleaner.
+ */
+ WT_STAT_CONN_INCRV(session, log_bytes_payload, record->size);
+ rdup_len = __wt_rduppo2((uint32_t)record->size, log->allocsize);
+ WT_ERR(__wt_buf_grow(session, record, rdup_len));
+ WT_ASSERT(session, record->data == record->mem);
+ /*
+ * If the caller's record only partially fills the necessary
+ * space, we need to zero-fill the remainder.
+ *
+ * The cast is safe, we've already checked to make sure it's in range.
+ */
+ fill_size = rdup_len - (uint32_t)record->size;
+ if (fill_size != 0) {
+ memset((uint8_t *)record->mem + record->size, 0, fill_size);
+ /*
+ * Set the last byte of the log record to a non-zero value,
+ * that allows us, on the input side, to tell that a log
+ * record was completely written; there couldn't have been
+ * a partial write. That means that any checksum mismatch
+ * in those conditions is a log corruption.
+ *
+ * Without this changed byte, when we see a zeroed last byte,
+ * we must always treat a checksum error as a possible partial
+ * write. Since partial writes can happen as a result of an
+ * interrupted process (for example, a shutdown), we must
+ * treat a checksum error as a normal occurrence, and merely
+ * the place where the log must be truncated. So any real
+ * corruption within log records is hard to detect as such.
+ *
+ * However, we can only make this modification if there is
+ * more than one byte being filled, as the first zero byte
+ * past the actual record is needed to terminate the loop
+ * in txn_commit_apply.
+ *
+ * This is not a log format change, as we are only changing a
+ * byte in the padding portion of a record, and no logging code
+ * has ever checked that it is any particular value up to now.
+ */
+ if (fill_size > 1)
+ *((uint8_t *)record->mem + rdup_len - 1) = WT_DEBUG_BYTE;
+ record->size = rdup_len;
+ }
+ /*
+ * Checksum a little-endian version of the header, and write everything in little-endian format.
+ * The checksum is (potentially) returned in a big-endian format; swap it into place in a
+ * separate step.
+ */
+ logrec = (WT_LOG_RECORD *)record->mem;
+ logrec->len = (uint32_t)record->size;
+ logrec->checksum = 0;
+ __wt_log_record_byteswap(logrec);
+ logrec->checksum = __wt_checksum(logrec, record->size);
#ifdef WORDS_BIGENDIAN
- logrec->checksum = __wt_bswap32(logrec->checksum);
+ logrec->checksum = __wt_bswap32(logrec->checksum);
#endif
- WT_STAT_CONN_INCR(session, log_writes);
-
- /*
- * The only time joining a slot should ever return an error is if it
- * detects a panic.
- */
- __wt_log_slot_join(session, rdup_len, flags, &myslot);
- /*
- * If the addition of this record crosses the buffer boundary,
- * switch in a new slot.
- */
- force = LF_ISSET(WT_LOG_FLUSH | WT_LOG_FSYNC);
- ret = 0;
- if (myslot.end_offset >= WT_LOG_SLOT_BUF_MAX ||
- F_ISSET(&myslot, WT_MYSLOT_UNBUFFERED) || force)
- ret = __wt_log_slot_switch(session, &myslot, true, false, NULL);
- if (ret == 0)
- ret = __wt_log_fill(session, &myslot, false, record, &lsn);
- release_size = __wt_log_slot_release(&myslot, (int64_t)rdup_len);
- /*
- * If we get an error we still need to do proper accounting in
- * the slot fields.
- * XXX On error we may still need to call release and free.
- */
- if (ret != 0)
- myslot.slot->slot_error = ret;
- WT_ASSERT(session, ret == 0);
- if (WT_LOG_SLOT_DONE(release_size)) {
- WT_ERR(__wt_log_release(session, myslot.slot, &free_slot));
- if (free_slot)
- __wt_log_slot_free(session, myslot.slot);
- } else if (force) {
- /*
- * If we are going to wait for this slot to get written,
- * signal the wrlsn thread.
- *
- * XXX I've seen times when conditions are NULL.
- */
- if (conn->log_cond != NULL) {
- __wt_cond_signal(session, conn->log_cond);
- __wt_yield();
- } else
- WT_ERR(__wt_log_force_write(session, 1, NULL));
- }
- if (LF_ISSET(WT_LOG_FLUSH)) {
- /* Wait for our writes to reach the OS */
- while (__wt_log_cmp(&log->write_lsn, &lsn) <= 0 &&
- myslot.slot->slot_error == 0)
- __wt_cond_wait(
- session, log->log_write_cond, 10000, NULL);
- } else if (LF_ISSET(WT_LOG_FSYNC)) {
- /* Wait for our writes to reach disk */
- while (__wt_log_cmp(&log->sync_lsn, &lsn) <= 0 &&
- myslot.slot->slot_error == 0)
- __wt_cond_wait(
- session, log->log_sync_cond, 10000, NULL);
- }
-
- /*
- * Advance the background sync LSN if needed.
- */
- if (LF_ISSET(WT_LOG_BACKGROUND))
- __wt_log_background(session, &lsn);
+ WT_STAT_CONN_INCR(session, log_writes);
+
+ /*
+ * The only time joining a slot should ever return an error is if it detects a panic.
+ */
+ __wt_log_slot_join(session, rdup_len, flags, &myslot);
+ /*
+ * If the addition of this record crosses the buffer boundary, switch in a new slot.
+ */
+ force = LF_ISSET(WT_LOG_FLUSH | WT_LOG_FSYNC);
+ ret = 0;
+ if (myslot.end_offset >= WT_LOG_SLOT_BUF_MAX || F_ISSET(&myslot, WT_MYSLOT_UNBUFFERED) || force)
+ ret = __wt_log_slot_switch(session, &myslot, true, false, NULL);
+ if (ret == 0)
+ ret = __wt_log_fill(session, &myslot, false, record, &lsn);
+ release_size = __wt_log_slot_release(&myslot, (int64_t)rdup_len);
+ /*
+ * If we get an error we still need to do proper accounting in the slot fields.
+ * XXX On error we may still need to call release and free.
+ */
+ if (ret != 0)
+ myslot.slot->slot_error = ret;
+ WT_ASSERT(session, ret == 0);
+ if (WT_LOG_SLOT_DONE(release_size)) {
+ WT_ERR(__wt_log_release(session, myslot.slot, &free_slot));
+ if (free_slot)
+ __wt_log_slot_free(session, myslot.slot);
+ } else if (force) {
+ /*
+ * If we are going to wait for this slot to get written,
+ * signal the wrlsn thread.
+ *
+ * XXX I've seen times when conditions are NULL.
+ */
+ if (conn->log_cond != NULL) {
+ __wt_cond_signal(session, conn->log_cond);
+ __wt_yield();
+ } else
+ WT_ERR(__wt_log_force_write(session, 1, NULL));
+ }
+ if (LF_ISSET(WT_LOG_FLUSH)) {
+ /* Wait for our writes to reach the OS */
+ while (__wt_log_cmp(&log->write_lsn, &lsn) <= 0 && myslot.slot->slot_error == 0)
+ __wt_cond_wait(session, log->log_write_cond, 10000, NULL);
+ } else if (LF_ISSET(WT_LOG_FSYNC)) {
+ /* Wait for our writes to reach disk */
+ while (__wt_log_cmp(&log->sync_lsn, &lsn) <= 0 && myslot.slot->slot_error == 0)
+ __wt_cond_wait(session, log->log_sync_cond, 10000, NULL);
+ }
+
+ /*
+ * Advance the background sync LSN if needed.
+ */
+ if (LF_ISSET(WT_LOG_BACKGROUND))
+ __wt_log_background(session, &lsn);
err:
- if (ret == 0 && lsnp != NULL)
- *lsnp = lsn;
- /*
- * If we're synchronous and some thread had an error, we don't know
- * if our write made it out to the file or not. The error could be
- * before or after us. So, if anyone got an error, we report it.
- * If we're not synchronous, only report if our own operation got
- * an error.
- */
- if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC) && ret == 0 &&
- myslot.slot != NULL)
- ret = myslot.slot->slot_error;
-
- /*
- * If one of the sync flags is set, assert the proper LSN has moved to
- * match on success.
- */
- WT_ASSERT(session, ret != 0 || !LF_ISSET(WT_LOG_FLUSH) ||
- __wt_log_cmp(&log->write_lsn, &lsn) >= 0);
- WT_ASSERT(session, ret != 0 || !LF_ISSET(WT_LOG_FSYNC) ||
- __wt_log_cmp(&log->sync_lsn, &lsn) >= 0);
- return (ret);
+ if (ret == 0 && lsnp != NULL)
+ *lsnp = lsn;
+ /*
+ * If we're synchronous and some thread had an error, we don't know if our write made it out to
+ * the file or not. The error could be before or after us. So, if anyone got an error, we report
+ * it. If we're not synchronous, only report if our own operation got an error.
+ */
+ if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC) && ret == 0 && myslot.slot != NULL)
+ ret = myslot.slot->slot_error;
+
+ /*
+ * If one of the sync flags is set, assert the proper LSN has moved to match on success.
+ */
+ WT_ASSERT(
+ session, ret != 0 || !LF_ISSET(WT_LOG_FLUSH) || __wt_log_cmp(&log->write_lsn, &lsn) >= 0);
+ WT_ASSERT(
+ session, ret != 0 || !LF_ISSET(WT_LOG_FSYNC) || __wt_log_cmp(&log->sync_lsn, &lsn) >= 0);
+ return (ret);
}
/*
* __wt_log_vprintf --
- * Write a message into the log.
+ * Write a message into the log.
*/
int
__wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap)
{
- WT_CONNECTION_IMPL *conn;
- WT_DECL_ITEM(logrec);
- WT_DECL_RET;
- size_t header_size, len;
- uint32_t rectype;
- const char *rec_fmt;
- va_list ap_copy;
-
- conn = S2C(session);
- rectype = WT_LOGREC_MESSAGE;
- rec_fmt = WT_UNCHECKED_STRING(I);
-
- if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
- return (0);
-
- va_copy(ap_copy, ap);
- len = 1;
- ret = __wt_vsnprintf_len_incr(NULL, 0, &len, fmt, ap_copy);
- va_end(ap_copy);
- WT_RET(ret);
-
- WT_RET(
- __wt_logrec_alloc(session, sizeof(WT_LOG_RECORD) + len, &logrec));
-
- /*
- * We're writing a record with the type (an integer) followed by a
- * string (NUL-terminated data). To avoid writing the string into
- * a buffer before copying it, we write the header first, then the
- * raw bytes of the string.
- */
- WT_ERR(__wt_struct_size(session, &header_size, rec_fmt, rectype));
- WT_ERR(__wt_struct_pack(session,
- (uint8_t *)logrec->data + logrec->size, header_size,
- rec_fmt, rectype));
- logrec->size += (uint32_t)header_size;
-
- WT_ERR(__wt_vsnprintf(
- (char *)logrec->data + logrec->size, len, fmt, ap));
-
- __wt_verbose(session, WT_VERB_LOG,
- "log_printf: %s", (char *)logrec->data + logrec->size);
-
- logrec->size += len;
- WT_ERR(__wt_log_write(session, logrec, NULL, 0));
-err: __wt_scr_free(session, &logrec);
- return (ret);
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_ITEM(logrec);
+ WT_DECL_RET;
+ size_t header_size, len;
+ uint32_t rectype;
+ const char *rec_fmt;
+ va_list ap_copy;
+
+ conn = S2C(session);
+ rectype = WT_LOGREC_MESSAGE;
+ rec_fmt = WT_UNCHECKED_STRING(I);
+
+ if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
+ return (0);
+
+ va_copy(ap_copy, ap);
+ len = 1;
+ ret = __wt_vsnprintf_len_incr(NULL, 0, &len, fmt, ap_copy);
+ va_end(ap_copy);
+ WT_RET(ret);
+
+ WT_RET(__wt_logrec_alloc(session, sizeof(WT_LOG_RECORD) + len, &logrec));
+
+ /*
+ * We're writing a record with the type (an integer) followed by a string (NUL-terminated data).
+ * To avoid writing the string into a buffer before copying it, we write the header first, then
+ * the raw bytes of the string.
+ */
+ WT_ERR(__wt_struct_size(session, &header_size, rec_fmt, rectype));
+ WT_ERR(__wt_struct_pack(
+ session, (uint8_t *)logrec->data + logrec->size, header_size, rec_fmt, rectype));
+ logrec->size += (uint32_t)header_size;
+
+ WT_ERR(__wt_vsnprintf((char *)logrec->data + logrec->size, len, fmt, ap));
+
+ __wt_verbose(session, WT_VERB_LOG, "log_printf: %s", (char *)logrec->data + logrec->size);
+
+ logrec->size += len;
+ WT_ERR(__wt_log_write(session, logrec, NULL, 0));
+err:
+ __wt_scr_free(session, &logrec);
+ return (ret);
}
/*
* __wt_log_flush --
- * Forcibly flush the log to the synchronization level specified.
- * Wait until it has been completed.
+ * Forcibly flush the log to the synchronization level specified. Wait until it has been
+ * completed.
*/
int
__wt_log_flush(WT_SESSION_IMPL *session, uint32_t flags)
{
- WT_CONNECTION_IMPL *conn;
- WT_LOG *log;
- WT_LSN last_lsn, lsn;
-
- conn = S2C(session);
- WT_ASSERT(session, FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED));
- log = conn->log;
- /*
- * We need to flush out the current slot first to get the real
- * end of log LSN in log->alloc_lsn.
- */
- WT_RET(__wt_log_flush_lsn(session, &lsn, false));
- last_lsn = log->alloc_lsn;
-
- /*
- * If the last write caused a switch to a new log file, we should only
- * wait for the last write to be flushed. Otherwise, if the workload
- * is single-threaded we could wait here forever because the write LSN
- * doesn't switch into the new file until it contains a record.
- */
- if (last_lsn.l.offset == log->first_record)
- last_lsn = log->log_close_lsn;
-
- /*
- * Wait until all current outstanding writes have been written
- * to the file system.
- */
- while (__wt_log_cmp(&last_lsn, &lsn) > 0) {
- __wt_sleep(0, WT_THOUSAND);
- WT_RET(__wt_log_flush_lsn(session, &lsn, false));
- }
-
- __wt_verbose(session, WT_VERB_LOG,
- "log_flush: flags %#" PRIx32 " LSN %" PRIu32 "/%" PRIu32,
- flags, lsn.l.file, lsn.l.offset);
- /*
- * If the user wants write-no-sync, there is nothing more to do.
- * If the user wants background sync, set the LSN and we're done.
- * If the user wants sync, force it now.
- */
- if (LF_ISSET(WT_LOG_BACKGROUND))
- __wt_log_background(session, &lsn);
- else if (LF_ISSET(WT_LOG_FSYNC))
- WT_RET(__wt_log_force_sync(session, &lsn));
- return (0);
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+ WT_LSN last_lsn, lsn;
+
+ conn = S2C(session);
+ WT_ASSERT(session, FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED));
+ log = conn->log;
+ /*
+ * We need to flush out the current slot first to get the real end of log LSN in log->alloc_lsn.
+ */
+ WT_RET(__wt_log_flush_lsn(session, &lsn, false));
+ last_lsn = log->alloc_lsn;
+
+ /*
+ * If the last write caused a switch to a new log file, we should only wait for the last write
+ * to be flushed. Otherwise, if the workload is single-threaded we could wait here forever
+ * because the write LSN doesn't switch into the new file until it contains a record.
+ */
+ if (last_lsn.l.offset == log->first_record)
+ last_lsn = log->log_close_lsn;
+
+ /*
+ * Wait until all current outstanding writes have been written to the file system.
+ */
+ while (__wt_log_cmp(&last_lsn, &lsn) > 0) {
+ __wt_sleep(0, WT_THOUSAND);
+ WT_RET(__wt_log_flush_lsn(session, &lsn, false));
+ }
+
+ __wt_verbose(session, WT_VERB_LOG, "log_flush: flags %#" PRIx32 " LSN %" PRIu32 "/%" PRIu32,
+ flags, lsn.l.file, lsn.l.offset);
+ /*
+ * If the user wants write-no-sync, there is nothing more to do. If the user wants background
+ * sync, set the LSN and we're done. If the user wants sync, force it now.
+ */
+ if (LF_ISSET(WT_LOG_BACKGROUND))
+ __wt_log_background(session, &lsn);
+ else if (LF_ISSET(WT_LOG_FSYNC))
+ WT_RET(__wt_log_force_sync(session, &lsn));
+ return (0);
}