summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/log/log.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/third_party/wiredtiger/src/log/log.c')
-rw-r--r--src/third_party/wiredtiger/src/log/log.c1243
1 files changed, 1243 insertions, 0 deletions
diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c
new file mode 100644
index 00000000000..d13002cdc5a
--- /dev/null
+++ b/src/third_party/wiredtiger/src/log/log.c
@@ -0,0 +1,1243 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_log_ckpt --
+ *     Store a copy of the caller's LSN as the log's checkpoint LSN, then
+ *     signal the connection's archive condition if one has been created.
+ *     Returns 0, or the error returned by the signal call.
+ */
+int
+__wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn)
+{
+    WT_CONNECTION_IMPL *connection;
+
+    connection = S2C(session);
+    connection->log->ckpt_lsn = *ckp_lsn;
+
+    /* Without an archive condition there is nobody to wake up. */
+    if (connection->arch_cond == NULL)
+        return (0);
+    return (__wt_cond_signal(session, connection->arch_cond));
+}
+
+/*
+ * __wt_log_written_reset --
+ *     Interface to reset the amount of log written during this checkpoint
+ *     period. Called from the checkpoint code. Does nothing when logging
+ *     is not enabled on the connection.
+ */
+void
+__wt_log_written_reset(WT_SESSION_IMPL *session)
+{
+    WT_CONNECTION_IMPL *connection;
+
+    connection = S2C(session);
+    if (connection->logging)
+        connection->log->log_written = 0;
+}
+
+/*
+ * __wt_log_get_files --
+ *     Return, through filesp/countp, the directory entries matching the
+ *     log file name prefix. The connection's log path is searched; an
+ *     unset path is passed to the directory listing as an empty string.
+ */
+int
+__wt_log_get_files(WT_SESSION_IMPL *session, char ***filesp, u_int *countp)
+{
+    const char *dir;
+
+    /* Give the caller well-defined results before anything can fail. */
+    *filesp = NULL;
+    *countp = 0;
+
+    dir = S2C(session)->log_path == NULL ? "" : S2C(session)->log_path;
+    return (__wt_dirlist(session,
+        dir, WT_LOG_FILENAME, WT_DIRLIST_INCLUDE, filesp, countp));
+}
+
+/*
+ * __wt_log_get_active_files --
+ *     Return the log files whose number is at or above the checkpoint
+ *     LSN's file number, that is, the files that are not candidates for
+ *     archiving. The order of the returned names is not preserved. On
+ *     error the list is freed and nothing is returned to the caller.
+ */
+int
+__wt_log_get_active_files(
+    WT_SESSION_IMPL *session, char ***filesp, u_int *countp)
+{
+    WT_DECL_RET;
+    WT_LOG *log;
+    char **list;
+    uint32_t lognum;
+    u_int idx, nfiles;
+
+    log = S2C(session)->log;
+    lognum = 0;
+
+    WT_RET(__wt_log_get_files(session, &list, &nfiles));
+
+    idx = 0;
+    while (idx < nfiles) {
+        WT_ERR(__wt_log_extract_lognum(session, list[idx], &lognum));
+        if (lognum >= log->ckpt_lsn.file) {
+            /* Still active, keep it and look at the next entry. */
+            ++idx;
+            continue;
+        }
+
+        /*
+         * Below the checkpoint: discard the name and move the last
+         * entry into its place. The index is not advanced so the
+         * moved entry is examined on the next pass.
+         */
+        __wt_free(session, list[idx]);
+        --nfiles;
+        list[idx] = list[nfiles];
+        list[nfiles] = NULL;
+    }
+
+    *filesp = list;
+    *countp = nfiles;
+    return (0);
+
+err:
+    __wt_log_files_free(session, list, nfiles);
+    return (ret);
+}
+
+/*
+ * __wt_log_files_free --
+ *     Release each of the first count names in a log file list, then the
+ *     list array itself.
+ */
+void
+__wt_log_files_free(WT_SESSION_IMPL *session, char **files, u_int count)
+{
+    u_int idx;
+
+    idx = 0;
+    while (idx < count) {
+        __wt_free(session, files[idx]);
+        ++idx;
+    }
+    __wt_free(session, files);
+}
+
+/*
+ * __wt_log_filename --
+ *     Format the name of log file number id into buf: the log file prefix,
+ *     a dot, and the number zero-padded to ten digits. A non-empty
+ *     connection log path is prepended with a '/' separator.
+ */
+int
+__wt_log_filename(WT_SESSION_IMPL *session, uint32_t id, WT_ITEM *buf)
+{
+    const char *dir;
+
+    dir = S2C(session)->log_path;
+
+    /* No directory configured: the name stands alone. */
+    if (dir == NULL || dir[0] == '\0')
+        return (__wt_buf_fmt(session, buf, "%s.%010" PRIu32,
+            WT_LOG_FILENAME, id));
+
+    return (__wt_buf_fmt(session, buf, "%s/%s.%010" PRIu32,
+        dir, WT_LOG_FILENAME, id));
+}
+
+/*
+ * __wt_log_extract_lognum --
+ *     Given a log file name, extract the log number: the text following
+ *     the last '.' in the name is parsed as an unsigned 32-bit value and
+ *     stored through id.
+ *
+ *     Returns 0 on success. Returns WT_ERROR, without a message, if name
+ *     or id is NULL, and WT_ERROR with a "Bad log file name" message if
+ *     the name has no '.' or no number can be parsed after it.
+ */
+int
+__wt_log_extract_lognum(
+    WT_SESSION_IMPL *session, const char *name, uint32_t *id)
+{
+    const char *p;
+
+    if (id == NULL || name == NULL)
+        return (WT_ERROR);
+
+    /*
+     * Use the scanf-family conversion macro: the PRI* macros are only
+     * specified for the printf family and need not match for scanf.
+     */
+    if ((p = strrchr(name, '.')) == NULL ||
+        sscanf(++p, "%" SCNu32, id) != 1)
+        WT_RET_MSG(session, WT_ERROR, "Bad log file name '%s'", name);
+    return (0);
+}
+
+/*
+ * __wt_log_remove --
+ *     Build the file name for the given log number, emit a verbose
+ *     message naming it, and remove the file. The scratch buffer holding
+ *     the name is released on every path.
+ */
+int
+__wt_log_remove(WT_SESSION_IMPL *session, uint32_t lognum)
+{
+    WT_DECL_ITEM(name);
+    WT_DECL_RET;
+
+    /* Each step runs only if every earlier step succeeded. */
+    if ((ret = __wt_scr_alloc(session, 0, &name)) == 0 &&
+        (ret = __wt_log_filename(session, lognum, name)) == 0 &&
+        (ret = __wt_verbose(session, WT_VERB_LOG,
+        "log_remove: remove log %s", (char *)name->data)) == 0)
+        ret = __wt_remove(session, name->data);
+
+    __wt_scr_free(&name);
+    return (ret);
+}
+
+/*
+ * __log_openfile --
+ *     Open the log file with the given number and hand back its WT_FH
+ *     through fh. The ok_create argument is passed straight through to
+ *     the open call. The scratch buffer used for the name is released
+ *     before returning.
+ */
+static int
+__log_openfile(WT_SESSION_IMPL *session, int ok_create, WT_FH **fh, uint32_t id)
+{
+    WT_DECL_ITEM(name);
+    WT_DECL_RET;
+
+    /* Nothing to clean up yet if the scratch buffer can't be had. */
+    if ((ret = __wt_scr_alloc(session, 0, &name)) != 0)
+        return (ret);
+
+    ret = __wt_log_filename(session, id, name);
+    if (ret == 0)
+        ret = __wt_verbose(session, WT_VERB_LOG,
+            "opening log %s", (const char *)name->data);
+    if (ret == 0)
+        ret = __wt_open(session,
+            name->data, ok_create, 0, WT_FILE_TYPE_LOG, fh);
+
+    __wt_scr_free(&name);
+    return (ret);
+}
+
+/*
+ * __wt_log_open --
+ *     Open the appropriate log file for the connection. The existing log
+ *     file names are scanned for the lowest and highest log numbers; the
+ *     highest (0 if there are none) becomes log->fileid and the lowest
+ *     (UINT32_MAX if there are none) becomes the file of log->first_lsn.
+ *     __wt_log_newfile is then called to start logging in the next file.
+ *     If any log files were found, trunc_lsn is set to the new allocation
+ *     LSN and recovery is run.
+ *
+ *     Returns 0 on success or the first error encountered. The file list
+ *     is freed on every path after it has been retrieved.
+ */
+int
+__wt_log_open(WT_SESSION_IMPL *session)
+{
+    WT_CONNECTION_IMPL *conn;
+    WT_DECL_RET;
+    WT_LOG *log;
+    uint32_t firstlog, lastlog, lognum;
+    u_int i, logcount;
+    char **logfiles;
+
+    conn = S2C(session);
+    log = conn->log;
+    lastlog = 0;
+    firstlog = UINT32_MAX;
+
+    WT_RET(__wt_log_get_files(session, &logfiles, &logcount));
+    for (i = 0; i < logcount; i++) {
+        WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
+        lastlog = WT_MAX(lastlog, lognum);
+        firstlog = WT_MIN(firstlog, lognum);
+    }
+    log->fileid = lastlog;
+
+    /*
+     * The log numbers are uint32_t: print them with the matching
+     * conversion so the UINT32_MAX "no files" sentinel is not shown as
+     * a negative number.
+     */
+    WT_ERR(__wt_verbose(session, WT_VERB_LOG,
+        "log_open: first log %" PRIu32 " last log %" PRIu32,
+        firstlog, lastlog));
+    log->first_lsn.file = firstlog;
+    log->first_lsn.offset = 0;
+
+    /*
+     * Start logging at the beginning of the next log file, no matter
+     * where the previous log file ends.
+     */
+    WT_ERR(__wt_log_newfile(session, 1));
+
+    /*
+     * If there were log files, run recovery.
+     * XXX belongs at a higher level than this.
+     */
+    if (logcount > 0) {
+        log->trunc_lsn = log->alloc_lsn;
+        WT_ERR(__wt_txn_recover(conn));
+    }
+
+err:
+    __wt_log_files_free(session, logfiles, logcount);
+    return (ret);
+}
+
+/*
+ * __wt_log_close --
+ *     Close the log file: first any previous log file handle that was set
+ *     aside to be closed (when it is distinct from the current handle),
+ *     then the current log file handle.
+ *
+ *     Each handle pointer is cleared in the log structure before the
+ *     handle is closed, the same hand-off used by __log_truncate, so the
+ *     log never keeps a pointer to a handle that has been passed to
+ *     __wt_close. That matters for log_close_fh in particular, because
+ *     __wt_log_newfile waits for it to become NULL.
+ *
+ *     Returns 0 on success or the first error encountered.
+ */
+int
+__wt_log_close(WT_SESSION_IMPL *session)
+{
+    WT_CONNECTION_IMPL *conn;
+    WT_FH *fh;
+    WT_LOG *log;
+
+    conn = S2C(session);
+    log = conn->log;
+
+    if (log->log_close_fh != NULL && log->log_close_fh != log->log_fh) {
+        WT_RET(__wt_verbose(session, WT_VERB_LOG,
+            "closing old log %s", log->log_close_fh->name));
+        fh = log->log_close_fh;
+        log->log_close_fh = NULL;
+        WT_RET(__wt_close(session, fh));
+    }
+    if (log->log_fh != NULL) {
+        WT_RET(__wt_verbose(session, WT_VERB_LOG,
+            "closing log %s", log->log_fh->name));
+        fh = log->log_fh;
+        log->log_fh = NULL;
+        /* The set-aside pointer may alias the handle being closed. */
+        if (log->log_close_fh == fh)
+            log->log_close_fh = NULL;
+        WT_RET(__wt_close(session, fh));
+    }
+    return (0);
+}
+
+/*
+ * __log_fill --
+ *     Place a thread's log record into its assigned slot. When direct is
+ *     set the record is written to the slot's file handle at the slot's
+ *     starting offset plus the thread's offset; otherwise it is copied
+ *     into the slot buffer at the thread's offset. If lsnp is not NULL it
+ *     receives the slot's starting LSN advanced by the thread's offset.
+ *     A write failure is returned and, if the slot has no error recorded
+ *     yet, stored in the slot.
+ */
+static int
+__log_fill(WT_SESSION_IMPL *session,
+    WT_MYSLOT *myslot, int direct, WT_ITEM *record, WT_LSN *lsnp)
+{
+    WT_DECL_RET;
+    WT_LOGSLOT *slot;
+    WT_LOG_RECORD *rec;
+
+    slot = myslot->slot;
+    rec = (WT_LOG_RECORD *)record->mem;
+
+    /*
+     * For now the offset is the real byte offset. If the offset becomes
+     * a unit of LOG_ALIGN this is where we would multiply by LOG_ALIGN
+     * to get the real file byte offset for write().
+     */
+    if (!direct)
+        memcpy((char *)slot->slot_buf.mem + myslot->offset,
+            rec, rec->len);
+    else if ((ret = __wt_write(session, slot->slot_fh,
+        myslot->offset + slot->slot_start_offset,
+        (size_t)rec->len, (void *)rec)) != 0) {
+        /* Only the first error seen on a slot is kept. */
+        if (slot->slot_error == 0)
+            slot->slot_error = ret;
+        return (ret);
+    }
+
+    WT_STAT_FAST_CONN_INCRV(session, log_bytes_written, rec->len);
+    if (lsnp != NULL) {
+        *lsnp = slot->slot_start_lsn;
+        lsnp->offset += (wt_off_t)myslot->offset;
+    }
+    return (0);
+}
+
+/*
+ * __log_size_fit --
+ *     Return non-zero if a record of recsize bytes placed at the LSN's
+ *     offset ends strictly below the connection's maximum log file size,
+ *     zero otherwise.
+ */
+static int
+__log_size_fit(WT_SESSION_IMPL *session, WT_LSN *lsn, uint64_t recsize)
+{
+    return (S2C(session)->log_file_max > lsn->offset + (wt_off_t)recsize);
+}
+
+/*
+ * __log_truncate --
+ * Truncate the log to the given LSN. If this_log is set, it will only
+ * truncate the log file indicated in the given LSN. If not set,
+ * it will truncate between the given LSN and the trunc_lsn. That is,
+ * since we pre-allocate log files, it will free that space and allow the
+ * log to be traversed. We use the trunc_lsn because logging has already
+ * opened the new/next log file before recovery ran. This function assumes
+ * we are in recovery or other dedicated time and not during live running.
+ *
+ * In detail: the file numbered lsn->file is cut at lsn->offset. When
+ * this_log is zero, every existing log file whose number is strictly
+ * between lsn->file and log->trunc_lsn.file is then cut back to
+ * LOG_FIRST_RECORD. Any handle still open and the file list are released
+ * before returning. Returns 0 on success or an error code.
+ */
+static int
+__log_truncate(WT_SESSION_IMPL *session, WT_LSN *lsn, uint32_t this_log)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_FH *log_fh, *tmp_fh;
+ WT_LOG *log;
+ uint32_t lognum;
+ u_int i, logcount;
+ char **logfiles;
+
+ conn = S2C(session);
+ log = conn->log;
+ log_fh = NULL;
+ logcount = 0;
+ logfiles = NULL;
+
+ /*
+ * Truncate the log file to the given LSN. The handle is moved into
+ * tmp_fh and log_fh cleared before the close, so the cleanup code at
+ * the err label never closes the same handle a second time.
+ */
+ WT_ERR(__log_openfile(session, 0, &log_fh, lsn->file));
+ WT_ERR(__wt_ftruncate(session, log_fh, lsn->offset));
+ tmp_fh = log_fh;
+ log_fh = NULL;
+ WT_ERR(__wt_close(session, tmp_fh));
+
+ /*
+ * If we just want to truncate the current log, return and skip
+ * looking for intervening logs. The err label doubles as the normal
+ * exit path, so jumping there is not an error.
+ */
+ if (this_log)
+ goto err;
+ WT_ERR(__wt_log_get_files(session, &logfiles, &logcount));
+ for (i = 0; i < logcount; i++) {
+ WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
+ if (lognum > lsn->file && lognum < log->trunc_lsn.file) {
+ WT_ERR(__log_openfile(session, 0, &log_fh, lognum));
+ /*
+ * If there are intervening files pre-allocated,
+ * truncate them to the end of the log file header.
+ */
+ WT_ERR(__wt_ftruncate(session,
+ log_fh, LOG_FIRST_RECORD));
+ tmp_fh = log_fh;
+ log_fh = NULL;
+ WT_ERR(__wt_close(session, tmp_fh));
+ }
+ }
+err: if (log_fh != NULL)
+ WT_TRET(__wt_close(session, log_fh));
+ if (logfiles != NULL)
+ __wt_log_files_free(session, logfiles, logcount);
+ return (ret);
+}
+
+/*
+ * __log_filesize --
+ *     Returns an estimate of the real end of log file through eof.
+ *
+ *     The file is read backward from its end in chunks until a chunk that
+ *     is not entirely zero is found; that chunk is then scanned backward
+ *     on allocation-size boundaries for the last non-zero 32-bit value.
+ *     The estimate is the offset of that boundary plus one allocation
+ *     unit. If eof is NULL nothing is done and 0 is returned.
+ *
+ *     Returns 0 on success or an error code from the size query, memory
+ *     allocation or read calls.
+ */
+static int
+__log_filesize(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *eof)
+{
+    WT_CONNECTION_IMPL *conn;
+    WT_DECL_RET;
+    WT_LOG *log;
+    wt_off_t log_size, off, off1;
+    uint32_t allocsize, bufsz;
+    int reread;
+    char *buf, *zerobuf;
+
+    conn = S2C(session);
+    log = conn->log;
+    if (eof == NULL)
+        return (0);
+    *eof = 0;
+    WT_RET(__wt_filesize(session, fh, &log_size));
+
+    /* Without a log structure fall back to the minimum alignment. */
+    if (log == NULL)
+        allocsize = LOG_ALIGN;
+    else
+        allocsize = log->allocsize;
+
+    /*
+     * It can be very slow looking for the last real record in the log
+     * in very small chunks. Walk backward by a megabyte at a time. When
+     * we find a part of the log that is not just zeroes, walk to find
+     * the last record.
+     */
+    buf = zerobuf = NULL;
+    if (allocsize < WT_MEGABYTE && log_size > WT_MEGABYTE)
+        bufsz = WT_MEGABYTE;
+    else
+        bufsz = allocsize;
+    WT_RET(__wt_calloc_def(session, bufsz, &buf));
+    WT_ERR(__wt_calloc_def(session, bufsz, &zerobuf));
+
+    /*
+     * Read in a chunk starting at the end of the file. Keep going until
+     * we reach the beginning or we find a chunk that contains any
+     * non-zero bytes. Compare against a known zero byte chunk.
+     */
+    for (off = log_size - (wt_off_t)bufsz;
+        off >= 0;
+        off -= (wt_off_t)bufsz) {
+        WT_ERR(__wt_read(session, fh, off, bufsz, buf));
+        if (memcmp(buf, zerobuf, bufsz) != 0)
+            break;
+    }
+
+    /*
+     * The offset goes negative if every chunk read was zero. If the
+     * file size is not a multiple of the chunk size, the head of the
+     * file was stepped over without being read and the buffer still
+     * holds the last all-zero chunk. Read the first chunk so the scan
+     * below sees the real start of the file rather than stale zeroes.
+     * Skip the read when the walk already covered offset zero (the
+     * offset is then exactly one chunk below zero) or when the file is
+     * smaller than a chunk.
+     */
+    if (off < 0) {
+        reread = off > -(wt_off_t)bufsz &&
+            log_size >= (wt_off_t)bufsz;
+        off = 0;
+        if (reread)
+            WT_ERR(__wt_read(session, fh, off, bufsz, buf));
+    }
+
+    /*
+     * We know all log records are aligned at log->allocsize. The first
+     * item in a log record is always a 32-bit length. Look for any
+     * non-zero length at the allocsize boundary. This may not be a true
+     * log record since it could be the middle of a large record. But we
+     * know no log record starts after it. Return an estimate of the log
+     * file size.
+     */
+    for (off1 = (wt_off_t)(bufsz - allocsize);
+        off1 > 0; off1 -= (wt_off_t)allocsize)
+        if (memcmp(buf + off1, zerobuf, sizeof(uint32_t)) != 0)
+            break;
+    off = off + off1;
+
+    /*
+     * Set EOF to one allocation unit past the last boundary found.
+     */
+    *eof = off + (wt_off_t)allocsize;
+err:
+    if (buf != NULL)
+        __wt_free(session, buf);
+    if (zerobuf != NULL)
+        __wt_free(session, zerobuf);
+    return (ret);
+}
+
+/*
+ * __log_acquire --
+ * Called with the log slot lock held. Can be called recursively
+ * from __wt_log_newfile when we change log files.
+ *
+ * Reserves recsize bytes of log space for the slot: if the record does
+ * not fit in the current file a new log file is started first, then the
+ * slot's release, start and end LSNs, start offset, error field and file
+ * handle are filled in and log->alloc_lsn is advanced by recsize.
+ * Returns 0 on success or an error code.
+ */
+static int
+__log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+
+ conn = S2C(session);
+ log = conn->log;
+ /*
+ * Called locked. Add recsize to alloc_lsn. Save our starting LSN
+ * where the previous allocation finished for the release LSN.
+ * That way when log files switch, we're waiting for the correct LSN
+ * from outstanding writes.
+ */
+ slot->slot_release_lsn = log->alloc_lsn;
+ if (!__log_size_fit(session, &log->alloc_lsn, recsize)) {
+ WT_RET(__wt_log_newfile(session, 0));
+ if (log->log_close_fh != NULL)
+ F_SET(slot, SLOT_CLOSEFH); /* __log_release closes the old file when it sees this flag. */
+ }
+ /*
+ * Checkpoints can be configured based on amount of log written.
+ * Add in this log record to the sum and if needed, signal the
+ * checkpoint condition. The logging subsystem manages the
+ * accumulated field. There is a bit of layering violation
+ * here checking the connection ckpt field and using its
+ * condition.
+ */
+ if (WT_CKPT_LOGSIZE(conn)) {
+ log->log_written += (wt_off_t)recsize;
+ WT_RET(__wt_checkpoint_signal(session, log->log_written));
+ }
+
+ /*
+ * Need to minimally fill in slot info here. Our slot start LSN
+ * comes after any potential new log file creations.
+ */
+ slot->slot_start_lsn = log->alloc_lsn;
+ slot->slot_start_offset = log->alloc_lsn.offset;
+ /*
+ * Pre-allocate on the first real write into the log file. If
+ * fallocate is marked unavailable on the handle, or the call
+ * returns ENOTSUP, extend the file with ftruncate instead.
+ */
+ if (log->alloc_lsn.offset == LOG_FIRST_RECORD) {
+ if (!log->log_fh->fallocate_available ||
+ (ret = __wt_fallocate(session, log->log_fh,
+ LOG_FIRST_RECORD, conn->log_file_max)) == ENOTSUP)
+ ret = __wt_ftruncate(session, log->log_fh,
+ LOG_FIRST_RECORD + conn->log_file_max);
+ WT_RET(ret);
+ }
+
+ log->alloc_lsn.offset += (wt_off_t)recsize;
+ slot->slot_end_lsn = log->alloc_lsn;
+ slot->slot_error = 0;
+ slot->slot_fh = log->log_fh;
+ return (0);
+}
+
+/*
+ * __log_release --
+ * Release a log slot.
+ *
+ * In order: take over the set-aside file handle if the slot is flagged
+ * SLOT_CLOSEFH, write the slot buffer to the slot's file if the slot is
+ * flagged SLOT_BUFFERED, wait until log->write_lsn reaches the slot's
+ * release LSN and then advance it to the slot's end LSN, fsync the log
+ * if the slot is flagged SLOT_SYNC and the sync LSN is still behind the
+ * slot's end, double the slot buffer if SLOT_BUF_GROW is set, and
+ * finally close the taken-over handle. Any error is returned and, if the
+ * slot has none recorded yet, stored in slot->slot_error.
+ *
+ * NOTE(review): close_fh is detached from log->log_close_fh up front
+ * but only closed on the success path; an earlier WT_ERR jump skips the
+ * close. Confirm whether that handle is intentionally left open.
+ */
+static int
+__log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_FH *close_fh;
+ WT_LOG *log;
+ WT_LSN sync_lsn;
+ size_t write_size;
+ WT_DECL_SPINLOCK_ID(id); /* Must appear last */
+
+ conn = S2C(session);
+ log = conn->log;
+ /*
+ * If we're going to have to close our log file, make a local copy
+ * of the file handle structure.
+ */
+ close_fh = NULL;
+ if (F_ISSET(slot, SLOT_CLOSEFH)) {
+ close_fh = log->log_close_fh;
+ log->log_close_fh = NULL;
+ F_CLR(slot, SLOT_CLOSEFH);
+ }
+
+ /* Write the buffered records */
+ if (F_ISSET(slot, SLOT_BUFFERED)) {
+ write_size = (size_t)
+ (slot->slot_end_lsn.offset - slot->slot_start_offset);
+ WT_ERR(__wt_write(session, slot->slot_fh,
+ slot->slot_start_offset, write_size, slot->slot_buf.mem));
+ }
+
+ /*
+ * Wait for earlier groups to finish, otherwise there could be holes
+ * in the log file.
+ */
+ while (LOG_CMP(&log->write_lsn, &slot->slot_release_lsn) != 0)
+ __wt_yield();
+ log->write_lsn = slot->slot_end_lsn;
+ /*
+ * Try to consolidate calls to fsync to wait less. Acquire a spin lock
+ * so that threads finishing writing to the log will wait while the
+ * current fsync completes and advance log->write_lsn.
+ */
+ while (F_ISSET(slot, SLOT_SYNC) &&
+ LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
+ if (__wt_spin_trylock(session, &log->log_sync_lock, &id) != 0) {
+ (void)__wt_cond_wait(
+ session, log->log_sync_cond, 10000);
+ continue;
+ }
+ /*
+ * Record the current end of log after we grabbed the lock.
+ * That is how far our fsync call will guarantee.
+ */
+ sync_lsn = log->write_lsn;
+ if (LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
+ WT_STAT_FAST_CONN_INCR(session, log_sync);
+ ret = __wt_fsync(session, log->log_fh);
+ if (ret == 0) {
+ F_CLR(slot, SLOT_SYNC);
+ log->sync_lsn = sync_lsn;
+ ret = __wt_cond_signal(
+ session, log->log_sync_cond);
+ }
+ }
+ __wt_spin_unlock(session, &log->log_sync_lock);
+ WT_ERR(ret); /* Checked only after the lock is dropped. */
+ }
+ if (F_ISSET(slot, SLOT_BUF_GROW)) {
+ WT_STAT_FAST_CONN_INCR(session, log_buffer_grow);
+ F_CLR(slot, SLOT_BUF_GROW);
+ WT_STAT_FAST_CONN_INCRV(session,
+ log_buffer_size, slot->slot_buf.memsize);
+ WT_ERR(__wt_buf_grow(session,
+ &slot->slot_buf, slot->slot_buf.memsize * 2));
+ }
+ /*
+ * If we have a file to close, close it now.
+ */
+ if (close_fh)
+ WT_ERR(__wt_close(session, close_fh));
+
+err: if (ret != 0 && slot->slot_error == 0)
+ slot->slot_error = ret;
+ return (ret);
+}
+
+/*
+ * __wt_log_newfile --
+ * Create the next log file and write the file header record into it.
+ *
+ * The current handle is set aside in log->log_close_fh, log->fileid is
+ * incremented and the new file opened (created if needed), and a header
+ * record of log->allocsize bytes carrying the log descriptor is written
+ * through __log_acquire and __log_fill. When conn_create is non-zero
+ * the file is also synced and the log's sync and write LSNs are set to
+ * the end of the header record. Returns 0 on success or an error code.
+ */
+int
+__wt_log_newfile(WT_SESSION_IMPL *session, int conn_create)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_LOG *log;
+ WT_LOG_DESC *desc;
+ WT_LOG_RECORD *logrec;
+ WT_LOGSLOT tmp;
+ WT_MYSLOT myslot;
+
+ conn = S2C(session);
+ log = conn->log;
+
+ /*
+ * Set aside the log file handle to be closed later. Other threads
+ * may still be using it to write to the log. If the log file size
+ * is small we could fill a log file before the previous one is closed.
+ * Wait for that to close. Note that the error message is emitted on
+ * every pass through the wait loop.
+ */
+ while (log->log_close_fh != NULL) {
+ __wt_errx(session,
+ "log_newfile: Log file size %" PRIuMAX " too small",
+ (uintmax_t)conn->log_file_max);
+ WT_STAT_FAST_CONN_INCR(session, log_close_yields);
+ __wt_yield();
+ }
+ log->log_close_fh = log->log_fh;
+ log->fileid++;
+ WT_RET(__log_openfile(session, 1, &log->log_fh, log->fileid));
+ log->alloc_lsn.file = log->fileid;
+ log->alloc_lsn.offset = log->log_fh->size;
+
+ /*
+ * Set up the log descriptor record. Use a scratch buffer to
+ * get correct alignment for direct I/O.
+ */
+ WT_ASSERT(session, sizeof(WT_LOG_DESC) < log->allocsize);
+ WT_RET(__wt_scr_alloc(session, log->allocsize, &buf));
+ memset(buf->mem, 0, log->allocsize);
+ logrec = (WT_LOG_RECORD *)buf->mem;
+ desc = (WT_LOG_DESC *)logrec->record;
+ desc->log_magic = WT_LOG_MAGIC;
+ desc->majorv = WT_LOG_MAJOR_VERSION;
+ desc->minorv = WT_LOG_MINOR_VERSION;
+ desc->log_size = (uint64_t)conn->log_file_max;
+
+ /*
+ * Now that the record is set up, initialize the record header.
+ * The checksum field is zeroed before the checksum is computed over
+ * the whole allocation-sized record.
+ */
+ logrec->len = log->allocsize;
+ logrec->checksum = 0;
+ logrec->checksum = __wt_cksum(logrec, log->allocsize);
+ WT_CLEAR(tmp);
+ myslot.slot = &tmp;
+ myslot.offset = 0;
+
+ /*
+ * Recursively call __log_acquire to allocate log space for the
+ * log descriptor record. Call __log_fill to write it, but we
+ * do not need to call __log_release because we're not waiting for
+ * earlier operations to complete.
+ */
+ WT_ERR(__log_acquire(session, logrec->len, &tmp));
+ WT_ERR(__log_fill(session, &myslot, 1, buf, NULL));
+
+ /*
+ * If we're called from connection creation code, we need to update
+ * the LSNs since we're the only write in progress.
+ */
+ if (conn_create) {
+ WT_ERR(__wt_fsync(session, log->log_fh));
+ log->sync_lsn = tmp.slot_end_lsn;
+ log->write_lsn = tmp.slot_end_lsn;
+ }
+
+err: __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __wt_log_read --
+ * Read the log record at the given LSN. Return the record (including
+ * the log header) in the WT_ITEM. Caller is responsible for freeing it.
+ *
+ * The flags argument is unused. Returns 0 without doing anything if
+ * lsnp or record is NULL. Returns WT_NOTFOUND if the LSN's offset is not
+ * a multiple of the allocation size, if its file number is beyond the
+ * current log file, or if the record length read at the LSN is zero.
+ * Returns WT_ERROR, with a message, on a checksum mismatch. The log
+ * file opened for the read is closed before returning.
+ */
+int
+__wt_log_read(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
+ uint32_t flags)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_FH *log_fh;
+ WT_LOG *log;
+ WT_LOG_RECORD *logrec;
+ uint32_t cksum, rdup_len, reclen;
+
+ WT_UNUSED(flags);
+ /*
+ * If the caller didn't give us an LSN or something to return,
+ * there's nothing to do.
+ */
+ if (lsnp == NULL || record == NULL)
+ return (0);
+ conn = S2C(session);
+ log = conn->log;
+ /*
+ * If the offset isn't on an allocation boundary it must be wrong.
+ * A file number past the current log file cannot exist either.
+ */
+ if (lsnp->offset % log->allocsize != 0 || lsnp->file > log->fileid)
+ return (WT_NOTFOUND);
+
+ WT_RET(__log_openfile(session, 0, &log_fh, lsnp->file));
+ /*
+ * Read the minimum allocation size a record could be.
+ */
+ WT_ERR(__wt_buf_init(session, record, log->allocsize));
+ WT_ERR(__wt_read(session,
+ log_fh, lsnp->offset, (size_t)log->allocsize, record->mem));
+ /*
+ * First 4 bytes is the real record length. See if we
+ * need to read more than the allocation size. We expect
+ * that we rarely will have to read more. Most log records
+ * will be fairly small.
+ */
+ reclen = *(uint32_t *)record->mem;
+ if (reclen == 0) {
+ ret = WT_NOTFOUND;
+ goto err;
+ }
+ if (reclen > log->allocsize) {
+ rdup_len = __wt_rduppo2(reclen, log->allocsize);
+ WT_ERR(__wt_buf_grow(session, record, rdup_len));
+ WT_ERR(__wt_read(session,
+ log_fh, lsnp->offset, (size_t)rdup_len, record->mem));
+ }
+ /*
+ * We read in the record, verify checksum. The stored checksum is
+ * saved and the field zeroed before recomputing, then the two values
+ * are compared.
+ */
+ logrec = (WT_LOG_RECORD *)record->mem;
+ cksum = logrec->checksum;
+ logrec->checksum = 0;
+ logrec->checksum = __wt_cksum(logrec, logrec->len);
+ if (logrec->checksum != cksum)
+ WT_ERR_MSG(session, WT_ERROR, "log_read: Bad checksum");
+ record->size = logrec->len;
+ WT_STAT_FAST_CONN_INCR(session, log_reads);
+err:
+ WT_TRET(__wt_close(session, log_fh));
+ return (ret);
+}
+
+/*
+ * __wt_log_scan --
+ * Scan the logs, calling a function on each record found.
+ *
+ * The scan starts at *lsnp if lsnp is given (or at the log's first LSN
+ * if *lsnp is the initial LSN), otherwise at the first LSN or checkpoint
+ * LSN according to WT_LOGSCAN_FIRST / WT_LOGSCAN_FROM_CKP. With no log
+ * structure on the connection, the range is taken from the log files
+ * found on disk and ENOTSUP is returned if there are none. The callback
+ * is not invoked for the record at offset 0 of a file (the file header).
+ * With WT_LOGSCAN_ONE the scan stops after the first callback. With
+ * WT_LOGSCAN_RECOVER the log files are truncated as the scan proceeds.
+ * The scan ends at a zero record length, at a checksum mismatch, or
+ * after the last file. An ENOENT result is converted to success.
+ *
+ * NOTE(review): with WT_LOGSCAN_RECOVER set, log->trunc_lsn is read
+ * both before the "log != NULL" test and in the final truncate check;
+ * presumably that flag is only passed when logging is configured --
+ * confirm against the callers.
+ */
+int
+__wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags,
+ int (*func)(WT_SESSION_IMPL *session,
+ WT_ITEM *record, WT_LSN *lsnp, void *cookie), void *cookie)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_FH *log_fh;
+ WT_ITEM buf;
+ WT_LOG *log;
+ WT_LOG_RECORD *logrec;
+ WT_LSN end_lsn, rd_lsn, start_lsn;
+ wt_off_t log_size;
+ uint32_t allocsize, cksum, firstlog, lastlog, lognum, rdup_len, reclen;
+ u_int i, logcount;
+ int eol;
+ char **logfiles;
+
+ conn = S2C(session);
+ log = conn->log;
+ log_fh = NULL;
+ logcount = 0;
+ logfiles = NULL;
+ eol = 0;
+ WT_CLEAR(buf);
+
+ /*
+ * If the caller did not give us a callback function there is nothing
+ * to do.
+ */
+ if (func == NULL)
+ return (0);
+
+ if (LF_ISSET(WT_LOGSCAN_RECOVER))
+ WT_RET(__wt_verbose(session, WT_VERB_LOG,
+ "__wt_log_scan truncating to %u/%" PRIuMAX,
+ log->trunc_lsn.file, (uintmax_t)log->trunc_lsn.offset));
+
+ if (log != NULL) {
+ allocsize = log->allocsize;
+
+ if (lsnp == NULL) {
+ if (LF_ISSET(WT_LOGSCAN_FIRST))
+ start_lsn = log->first_lsn;
+ else if (LF_ISSET(WT_LOGSCAN_FROM_CKP))
+ start_lsn = log->ckpt_lsn;
+ else
+ return (WT_ERROR); /* Illegal usage */
+ } else {
+ if (LF_ISSET(WT_LOGSCAN_FIRST|WT_LOGSCAN_FROM_CKP))
+ WT_RET_MSG(session, WT_ERROR,
+ "choose either a start LSN or a start flag");
+
+ /* Offsets must be on allocation boundaries. */
+ if (lsnp->offset % allocsize != 0 ||
+ lsnp->file > log->fileid)
+ return (WT_NOTFOUND);
+
+ /*
+ * Log cursors may not know the starting LSN. If an
+ * LSN pointer is passed in, but it is the INIT_LSN,
+ * start from the first_lsn.
+ */
+ start_lsn = *lsnp;
+ if (IS_INIT_LSN(&start_lsn))
+ start_lsn = log->first_lsn;
+ }
+ end_lsn = log->alloc_lsn;
+ } else {
+ /*
+ * If logging is not configured, we can still print out the log
+ * if log files exist. We just need to set the LSNs from what
+ * is in the files versus what is in the live connection.
+ */
+ /*
+ * Set allocsize to the minimum alignment it could be. Larger
+ * records and larger allocation boundaries should always be
+ * a multiple of this.
+ */
+ allocsize = LOG_ALIGN;
+ lastlog = 0;
+ firstlog = UINT32_MAX;
+ WT_RET(__wt_log_get_files(session, &logfiles, &logcount));
+ if (logcount == 0)
+ /*
+ * Return ENOTSUP if no log files exist.
+ */
+ return (ENOTSUP);
+ for (i = 0; i < logcount; i++) {
+ WT_ERR(__wt_log_extract_lognum(session, logfiles[i],
+ &lognum));
+ lastlog = WT_MAX(lastlog, lognum);
+ firstlog = WT_MIN(firstlog, lognum);
+ }
+ start_lsn.file = firstlog;
+ end_lsn.file = lastlog;
+ start_lsn.offset = end_lsn.offset = 0;
+ __wt_log_files_free(session, logfiles, logcount);
+ logfiles = NULL;
+ }
+ WT_ERR(__log_openfile(session, 0, &log_fh, start_lsn.file));
+ WT_ERR(__log_filesize(session, log_fh, &log_size));
+ rd_lsn = start_lsn;
+ WT_ERR(__wt_buf_initsize(session, &buf, LOG_ALIGN));
+ for (;;) {
+ if (rd_lsn.offset + allocsize > log_size) {
+advance:
+ /*
+ * If we read the last record, go to the next file.
+ */
+ WT_ERR(__wt_close(session, log_fh));
+ log_fh = NULL;
+ eol = 1;
+ /*
+ * Truncate this log file before we move to the next.
+ */
+ if (LF_ISSET(WT_LOGSCAN_RECOVER))
+ WT_ERR(__log_truncate(session, &rd_lsn, 1));
+ rd_lsn.file++;
+ rd_lsn.offset = 0;
+ /*
+ * Avoid an error message when we reach end of log
+ * by checking here.
+ */
+ if (rd_lsn.file > end_lsn.file)
+ break;
+ WT_ERR(__log_openfile(
+ session, 0, &log_fh, rd_lsn.file));
+ WT_ERR(__log_filesize(session, log_fh, &log_size));
+ continue;
+ }
+ /*
+ * Read the minimum allocation size a record could be.
+ */
+ WT_ASSERT(session, buf.memsize >= allocsize);
+ WT_ERR(__wt_read(session,
+ log_fh, rd_lsn.offset, (size_t)allocsize, buf.mem));
+ /*
+ * First 4 bytes is the real record length. See if we
+ * need to read more than the allocation size. We expect
+ * that we rarely will have to read more. Most log records
+ * will be fairly small.
+ */
+ reclen = *(uint32_t *)buf.mem;
+ /*
+ * Log files are pre-allocated. We never expect a zero length
+ * unless we've reached the end of the log. The log can be
+ * written out of order, so when recovery finds the end of
+ * the log, truncate the file and remove any later log files
+ * that may exist.
+ */
+ if (reclen == 0) {
+ /* This LSN is the end. */
+ break;
+ }
+ rdup_len = __wt_rduppo2(reclen, allocsize);
+ if (reclen > allocsize) {
+ /*
+ * The log file end could be the middle of this
+ * log record.
+ */
+ if (rd_lsn.offset + rdup_len > log_size)
+ goto advance;
+ /*
+ * We need to round up and read in the full padded
+ * record, especially for direct I/O.
+ */
+ WT_ERR(__wt_buf_grow(session, &buf, rdup_len));
+ WT_ERR(__wt_read(session,
+ log_fh, rd_lsn.offset, (size_t)rdup_len, buf.mem));
+ WT_STAT_FAST_CONN_INCR(session, log_scan_rereads);
+ }
+ /*
+ * We read in the record, verify checksum.
+ */
+ buf.size = reclen;
+ logrec = (WT_LOG_RECORD *)buf.mem;
+ cksum = logrec->checksum;
+ logrec->checksum = 0;
+ logrec->checksum = __wt_cksum(logrec, logrec->len);
+ if (logrec->checksum != cksum) {
+ /*
+ * A checksum mismatch means we have reached the end of
+ * the useful part of the log. This should be found on
+ * the first pass through recovery. In the second pass
+ * where we truncate the log, this is where it should
+ * end.
+ */
+ if (log != NULL)
+ log->trunc_lsn = rd_lsn;
+ break;
+ }
+
+ /*
+ * We have a valid log record. If it is not the log file
+ * header, invoke the callback.
+ */
+ WT_STAT_FAST_CONN_INCR(session, log_scan_records);
+ if (rd_lsn.offset != 0) {
+ WT_ERR((*func)(session, &buf, &rd_lsn, cookie));
+ if (LF_ISSET(WT_LOGSCAN_ONE))
+ break;
+ }
+ rd_lsn.offset += (wt_off_t)rdup_len;
+ }
+
+ /* Truncate if we're in recovery. */
+ if (LF_ISSET(WT_LOGSCAN_RECOVER) &&
+ LOG_CMP(&rd_lsn, &log->trunc_lsn) < 0)
+ WT_ERR(__log_truncate(session, &rd_lsn, 0));
+
+err: WT_STAT_FAST_CONN_INCR(session, log_scans);
+ if (logfiles != NULL)
+ __wt_log_files_free(session, logfiles, logcount);
+ __wt_buf_free(session, &buf);
+ /*
+ * If the caller wants one record and it is at the end of log,
+ * return WT_NOTFOUND.
+ */
+ if (LF_ISSET(WT_LOGSCAN_ONE) && eol && ret == 0)
+ ret = WT_NOTFOUND;
+ if (ret == ENOENT)
+ ret = 0;
+ if (log_fh != NULL)
+ WT_TRET(__wt_close(session, log_fh));
+ return (ret);
+}
+
+/*
+ * __log_direct_write --
+ *     Write a single log record straight to the log file, bypassing the
+ *     consolidation arrays, using a temporary slot local to this call.
+ *     Returns EAGAIN immediately if the log slot lock cannot be taken
+ *     without waiting; otherwise returns 0 or the first error from
+ *     acquiring log space, writing the record or releasing the slot.
+ */
+static int
+__log_direct_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
+    uint32_t flags)
+{
+    WT_DECL_RET;
+    WT_LOG *log;
+    WT_LOGSLOT tmp;
+    WT_MYSLOT myslot;
+    WT_DECL_SPINLOCK_ID(id);        /* Must appear last */
+
+    log = S2C(session)->log;
+
+    /* Build a private, empty slot describing just this record. */
+    WT_CLEAR(tmp);
+    myslot.slot = &tmp;
+    myslot.offset = 0;
+
+    /* Fast path the contended case. */
+    if (__wt_spin_trylock(session, &log->log_slot_lock, &id) != 0)
+        return (EAGAIN);
+
+    if (LF_ISSET(WT_LOG_FSYNC))
+        F_SET(&tmp, SLOT_SYNC);
+
+    /*
+     * Log space is reserved under the slot lock; the lock is dropped
+     * whether or not the reservation worked, and before any I/O.
+     */
+    ret = __log_acquire(session, record->size, &tmp);
+    __wt_spin_unlock(session, &log->log_slot_lock);
+    if (ret != 0)
+        return (ret);
+
+    if ((ret = __log_fill(session, &myslot, 1, record, lsnp)) != 0)
+        return (ret);
+    return (__log_release(session, &tmp));
+}
+
+/*
+ * __wt_log_write --
+ * Write a record into the log.
+ *
+ * The record buffer is grown to the size returned by __wt_rduppo2 for
+ * the record size and log->allocsize, zero-filled past the caller's
+ * data, and its length and checksum header fields are filled in. Unless
+ * WT_LOG_FORCE_CONSOLIDATE is set on the log, a direct write is tried
+ * first; if that returns EAGAIN the flag is set and the record goes
+ * through the slot consolidation code instead. If joining a slot
+ * returns ENOMEM the record is written directly, retrying while the
+ * direct write returns EAGAIN, and the slot buffers are then grown.
+ * On success through the consolidation path, *lsnp (if not NULL)
+ * receives the record's LSN. Returns 0 on success or an error code.
+ */
+int
+__wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
+ uint32_t flags)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+ WT_LOG_RECORD *logrec;
+ WT_LSN lsn;
+ WT_MYSLOT myslot;
+ uint32_t rdup_len;
+ int locked;
+
+ conn = S2C(session);
+ log = conn->log;
+ locked = 0;
+ INIT_LSN(&lsn);
+ myslot.slot = NULL;
+ /*
+ * Assume the WT_ITEM the user passed is a WT_LOG_RECORD, which has
+ * a header at the beginning for us to fill in.
+ *
+ * If using direct_io, the caller should pass us an aligned record.
+ * But we need to make sure it is big enough and zero-filled so
+ * that we can write the full amount. Do this whether or not
+ * direct_io is in use because it makes the reading code cleaner.
+ */
+ WT_STAT_FAST_CONN_INCRV(session, log_bytes_user, record->size);
+ rdup_len = __wt_rduppo2((uint32_t)record->size, log->allocsize);
+ WT_ERR(__wt_buf_grow(session, record, rdup_len));
+ WT_ASSERT(session, record->data == record->mem);
+ /*
+ * If the caller's record only partially fills the necessary
+ * space, we need to zero-fill the remainder.
+ */
+ if (record->size != rdup_len) {
+ memset((uint8_t *)record->mem + record->size, 0,
+ rdup_len - record->size);
+ record->size = rdup_len;
+ }
+ logrec = (WT_LOG_RECORD *)record->mem;
+ logrec->len = (uint32_t)record->size;
+ logrec->checksum = 0;
+ logrec->checksum = __wt_cksum(logrec, record->size);
+
+ WT_STAT_FAST_CONN_INCR(session, log_writes);
+
+ if (!F_ISSET(log, WT_LOG_FORCE_CONSOLIDATE)) {
+ ret = __log_direct_write(session, record, lsnp, flags);
+ if (ret == 0)
+ return (0);
+ if (ret != EAGAIN)
+ WT_ERR(ret);
+ /*
+ * An EAGAIN return means we failed to get the try lock -
+ * fall through to the consolidation code in that case.
+ */
+ }
+
+ /*
+ * As soon as we see contention for the log slot, disable direct
+ * log writes. We get better performance by forcing writes through
+ * the consolidation code. This is because individual writes flood
+ * the I/O system faster than they contend on the log slot lock.
+ */
+ F_SET(log, WT_LOG_FORCE_CONSOLIDATE);
+ if ((ret = __wt_log_slot_join(
+ session, rdup_len, flags, &myslot)) == ENOMEM) {
+ /*
+ * If we couldn't find a consolidated slot for this record
+ * write the record directly.
+ */
+ while ((ret = __log_direct_write(
+ session, record, lsnp, flags)) == EAGAIN)
+ ;
+ WT_ERR(ret);
+ /*
+ * Increase the buffer size of any slots we can get access
+ * to, so future consolidations are likely to succeed.
+ */
+ WT_ERR(__wt_log_slot_grow_buffers(session, 4 * rdup_len));
+ return (0);
+ }
+ WT_ERR(ret);
+ if (myslot.offset == 0) { /* This thread closes the slot, acquires its log space and notifies the slot. */
+ __wt_spin_lock(session, &log->log_slot_lock);
+ locked = 1;
+ WT_ERR(__wt_log_slot_close(session, myslot.slot));
+ WT_ERR(__log_acquire(
+ session, myslot.slot->slot_group_size, myslot.slot));
+ __wt_spin_unlock(session, &log->log_slot_lock);
+ locked = 0;
+ WT_ERR(__wt_log_slot_notify(session, myslot.slot));
+ } else
+ WT_ERR(__wt_log_slot_wait(session, myslot.slot));
+ WT_ERR(__log_fill(session, &myslot, 0, record, &lsn));
+ if (__wt_log_slot_release(myslot.slot, rdup_len) == WT_LOG_SLOT_DONE) {
+ WT_ERR(__log_release(session, myslot.slot));
+ WT_ERR(__wt_log_slot_free(myslot.slot));
+ } else if (LF_ISSET(WT_LOG_FSYNC)) {
+ /*
+ * Wait for our writes to reach disk: loop until the log's sync
+ * LSN has moved past our LSN or the slot records an error.
+ */
+ while (LOG_CMP(&log->sync_lsn, &lsn) <= 0 &&
+ myslot.slot->slot_error == 0)
+ (void)__wt_cond_wait(
+ session, log->log_sync_cond, 10000);
+ }
+err:
+ if (locked)
+ __wt_spin_unlock(session, &log->log_slot_lock);
+ if (ret == 0 && lsnp != NULL)
+ *lsnp = lsn;
+ /*
+ * If we're synchronous and some thread had an error, we don't know
+ * if our write made it out to the file or not. The error could be
+ * before or after us. So, if anyone got an error, we report it.
+ * If we're not synchronous, only report if our own operation got
+ * an error.
+ */
+ if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC) && ret == 0 &&
+ myslot.slot != NULL)
+ ret = myslot.slot->slot_error;
+ return (ret);
+}
+
+/*
+ * __wt_log_vprintf --
+ * Write a message into the log.
+ *
+ * Formats fmt/ap into a log record of type WT_LOGREC_MESSAGE and writes
+ * it with __wt_log_write. Returns 0 without doing anything if logging
+ * is not enabled on the connection; otherwise returns 0 on success or
+ * an error code. The record buffer is released on every path after it
+ * has been allocated.
+ */
+int
+__wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_ITEM(logrec);
+ WT_DECL_RET;
+ va_list ap_copy;
+ const char *rec_fmt = WT_UNCHECKED_STRING(I);
+ uint32_t rectype = WT_LOGREC_MESSAGE;
+ size_t header_size, len;
+
+ conn = S2C(session);
+
+ if (!conn->logging)
+ return (0);
+
+ va_copy(ap_copy, ap); /* Measure with a copy so ap can still be used for the real formatting. */
+ len = (size_t)vsnprintf(NULL, 0, fmt, ap_copy) + 1; /* Formatted length plus the terminating NUL. */
+ va_end(ap_copy);
+
+ WT_RET(
+ __wt_logrec_alloc(session, sizeof(WT_LOG_RECORD) + len, &logrec));
+
+ /*
+ * We're writing a record with the type (an integer) followed by a
+ * string (NUL-terminated data). To avoid writing the string into
+ * a buffer before copying it, we write the header first, then the
+ * raw bytes of the string.
+ */
+ WT_ERR(__wt_struct_size(session, &header_size, rec_fmt, rectype));
+ WT_ERR(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, header_size,
+ rec_fmt, rectype));
+ logrec->size += (uint32_t)header_size;
+
+ (void)vsnprintf((char *)logrec->data + logrec->size, len, fmt, ap);
+
+ WT_ERR(__wt_verbose(session, WT_VERB_LOG,
+ "log_printf: %s", (char *)logrec->data + logrec->size));
+
+ logrec->size += len; /* Account for the message text, including its NUL. */
+ WT_ERR(__wt_log_write(session, logrec, NULL, 0));
+err: __wt_scr_free(&logrec);
+ return (ret);
+}