1 files changed, 1243 insertions, 0 deletions
diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c
new file mode 100644
index 00000000000..d13002cdc5a
--- /dev/null
+++ b/src/third_party/wiredtiger/src/log/log.c
@@ -0,0 +1,1243 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ *	All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_log_ckpt --
+ *	Record the given LSN as the checkpoint LSN and signal the archive
+ *	thread as needed.
+ */
+int
+__wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_LOG *log;
+
+	conn = S2C(session);
+	log = conn->log;
+	log->ckpt_lsn = *ckp_lsn;
+	if (conn->arch_cond != NULL)
+		WT_RET(__wt_cond_signal(session, conn->arch_cond));
+	return (0);
+}
+
+/*
+ * __wt_log_written_reset --
+ *	Interface to reset the amount of log written during this
+ *	during this checkpoint period.  Called from the checkpoint code.
+ */
+void
+__wt_log_written_reset(WT_SESSION_IMPL *session)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_LOG *log;
+
+	conn = S2C(session);
+	if (!conn->logging)
+		return;
+	log = conn->log;
+	log->log_written = 0;
+	return;
+}
+
+/*
+ * __wt_log_get_files --
+ *	Retrieve the list of all existing log files.
+ */
+int
+__wt_log_get_files(WT_SESSION_IMPL *session, char ***filesp, u_int *countp)
+{
+	WT_CONNECTION_IMPL *conn;
+	const char *log_path;
+
+	*countp = 0;
+	*filesp = NULL;
+
+	conn = S2C(session);
+	log_path = conn->log_path;
+	if (log_path == NULL)
+		log_path = "";
+	return (__wt_dirlist(session, log_path, WT_LOG_FILENAME,
+	    WT_DIRLIST_INCLUDE, filesp, countp));
+}
+
+/*
+ * __wt_log_get_active_files --
+ *	Retrieve the list of active log files (those that are not candidates
+ *	for archiving).
+ */
+int
+__wt_log_get_active_files(
+	WT_SESSION_IMPL *session, char ***filesp, u_int *countp)
+{
+	WT_DECL_RET;
+	WT_LOG *log;
+	char **files;
+	uint32_t id;
+	u_int count, i;
+
+	id = 0;
+	log = S2C(session)->log;
+
+	WT_RET(__wt_log_get_files(session, &files, &count));
+
+	/* Filter out any files that are below the checkpoint LSN. */
+	for (i = 0; i < count; ) {
+		WT_ERR(__wt_log_extract_lognum(session, files[i], &id));
+		if (id < log->ckpt_lsn.file) {
+			__wt_free(session, files[i]);
+			files[i] = files[count - 1];
+			files[--count] = NULL;
+		} else
+			i++;
+	}
+
+	*filesp = files;
+	*countp = count;
+
+	if (0) {
+err:		__wt_log_files_free(session, files, count);
+	}
+	return (ret);
+}
+
+/*
+ * __wt_log_files_free --
+ *	Free memory associated with a log file list.
+ */
+void
+__wt_log_files_free(WT_SESSION_IMPL *session, char **files, u_int count)
+{
+	u_int i;
+
+	for (i = 0; i < count; i++)
+		__wt_free(session, files[i]);
+	__wt_free(session, files);
+}
+
+/*
+ * __wt_log_filename --
+ *	Given a log number, return a WT_ITEM of a generated log file name.
+ */
+int
+__wt_log_filename(WT_SESSION_IMPL *session, uint32_t id, WT_ITEM *buf)
+{
+	const char *log_path;
+
+	log_path = S2C(session)->log_path;
+
+	if (log_path != NULL && log_path[0] != '\0')
+		WT_RET(__wt_buf_fmt(session, buf, "%s/%s.%010" PRIu32,
+		    log_path, WT_LOG_FILENAME, id));
+	else
+		WT_RET(__wt_buf_fmt(session, buf, "%s.%010" PRIu32,
+		    WT_LOG_FILENAME, id));
+
+	return (0);
+}
+
+/*
+ * __wt_log_extract_lognum --
+ *	Given a log file name, extract out the log number.
+ */
+int
+__wt_log_extract_lognum(
+    WT_SESSION_IMPL *session, const char *name, uint32_t *id)
+{
+	const char *p;
+
+	WT_UNUSED(session);
+
+	if (id == NULL || name == NULL)
+		return (WT_ERROR);
+	if ((p = strrchr(name, '.')) == NULL ||
+	    sscanf(++p, "%" PRIu32, id) != 1)
+		WT_RET_MSG(session, WT_ERROR, "Bad log file name '%s'", name);
+	return (0);
+}
+
+/*
+ * __wt_log_remove --
+ *	Given a log number, remove that log file.
+ */
+int
+__wt_log_remove(WT_SESSION_IMPL *session, uint32_t lognum)
+{
+	WT_DECL_ITEM(path);
+	WT_DECL_RET;
+
+	WT_ERR(__wt_scr_alloc(session, 0, &path));
+	WT_ERR(__wt_log_filename(session, lognum, path));
+	WT_ERR(__wt_verbose(session, WT_VERB_LOG,
+	    "log_remove: remove log %s", (char *)path->data));
+	WT_ERR(__wt_remove(session, path->data));
+err:	__wt_scr_free(&path);
+	return (ret);
+}
+
+/*
+ * __log_openfile --
+ *	Open a log file with the given log file number and return the WT_FH.
+ */
+static int
+__log_openfile(WT_SESSION_IMPL *session, int ok_create, WT_FH **fh, uint32_t id)
+{
+	WT_DECL_ITEM(path);
+	WT_DECL_RET;
+
+	WT_RET(__wt_scr_alloc(session, 0, &path));
+	WT_ERR(__wt_log_filename(session, id, path));
+	WT_ERR(__wt_verbose(session, WT_VERB_LOG,
+	    "opening log %s", (const char *)path->data));
+	WT_ERR(__wt_open(
+	    session, path->data, ok_create, 0, WT_FILE_TYPE_LOG, fh));
+err:	__wt_scr_free(&path);
+	return (ret);
+}
+
+/*
+ * __wt_log_open --
+ *	Open the appropriate log file for the connection.  The purpose is
+ *	to find the last log file that exists, open it and set our initial
+ *	LSNs to the end of that file.  If none exist, call __wt_log_newfile
+ *	to create it.
+ */
+int
+__wt_log_open(WT_SESSION_IMPL *session)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	WT_LOG *log;
+	uint32_t firstlog, lastlog, lognum;
+	u_int i, logcount;
+	char **logfiles;
+
+	conn = S2C(session);
+	log = conn->log;
+	lastlog = 0;
+	firstlog = UINT32_MAX;
+
+	WT_RET(__wt_log_get_files(session, &logfiles, &logcount));
+	for (i = 0; i < logcount; i++) {
+		WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
+		lastlog = WT_MAX(lastlog, lognum);
+		firstlog = WT_MIN(firstlog, lognum);
+	}
+	log->fileid = lastlog;
+	WT_ERR(__wt_verbose(session, WT_VERB_LOG,
+	    "log_open: first log %d last log %d", firstlog, lastlog));
+	log->first_lsn.file = firstlog;
+	log->first_lsn.offset = 0;
+
+	/*
+	 * Start logging at the beginning of the next log file, no matter
+	 * where the previous log file ends.
+	 */
+	WT_ERR(__wt_log_newfile(session, 1));
+
+	/*
+	 * If there were log files, run recovery.
+	 * XXX belongs at a higher level than this.
+	 */
+	if (logcount > 0) {
+		log->trunc_lsn = log->alloc_lsn;
+		WT_ERR(__wt_txn_recover(conn));
+	}
+
+err:	__wt_log_files_free(session, logfiles, logcount);
+	return (ret);
+}
+
+/*
+ * __wt_log_close --
+ *	Close the log file.
+ */
+int
+__wt_log_close(WT_SESSION_IMPL *session)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_LOG *log;
+
+	conn = S2C(session);
+	log = conn->log;
+
+	if (log->log_close_fh != NULL && log->log_close_fh != log->log_fh) {
+		WT_RET(__wt_verbose(session, WT_VERB_LOG,
+		    "closing old log %s", log->log_close_fh->name));
+		WT_RET(__wt_close(session, log->log_close_fh));
+	}
+	if (log->log_fh != NULL) {
+		WT_RET(__wt_verbose(session, WT_VERB_LOG,
+		    "closing log %s", log->log_fh->name));
+		WT_RET(__wt_close(session, log->log_fh));
+		log->log_fh = NULL;
+	}
+	return (0);
+}
+
+/*
+ * __log_fill --
+ *	Copy a thread's log records into the assigned slot.
+ */
+static int
+__log_fill(WT_SESSION_IMPL *session,
+    WT_MYSLOT *myslot, int direct, WT_ITEM *record, WT_LSN *lsnp)
+{
+	WT_DECL_RET;
+	WT_LOG_RECORD *logrec;
+
+	logrec = (WT_LOG_RECORD *)record->mem;
+	/*
+	 * Call __wt_write.  For now the offset is the real byte offset.
+	 * If the offset becomes a unit of LOG_ALIGN this is where we would
+	 * multiply by LOG_ALIGN to get the real file byte offset for write().
+	 */
+	if (direct)
+		WT_ERR(__wt_write(session, myslot->slot->slot_fh,
+		    myslot->offset + myslot->slot->slot_start_offset,
+		    (size_t)logrec->len, (void *)logrec));
+	else
+		memcpy((char *)myslot->slot->slot_buf.mem + myslot->offset,
+		    logrec, logrec->len);
+
+	WT_STAT_FAST_CONN_INCRV(session, log_bytes_written, logrec->len);
+	if (lsnp != NULL) {
+		*lsnp = myslot->slot->slot_start_lsn;
+		lsnp->offset += (wt_off_t)myslot->offset;
+	}
+err:
+	if (ret != 0 && myslot->slot->slot_error == 0)
+		myslot->slot->slot_error = ret;
+	return (ret);
+}
+
+/*
+ * __log_size_fit --
+ *	Return whether or not recsize will fit in the log file.
+ */
+static int
+__log_size_fit(WT_SESSION_IMPL *session, WT_LSN *lsn, uint64_t recsize)
+{
+	WT_CONNECTION_IMPL *conn;
+
+	conn = S2C(session);
+	return (lsn->offset + (wt_off_t)recsize < conn->log_file_max);
+}
+
+/*
+ * __log_truncate --
+ *	Truncate the log to the given LSN.  If this_log is set, it will only
+ *	truncate the log file indicated in the given LSN.  If not set,
+ *	it will truncate between the given LSN and the trunc_lsn.  That is,
+ *	since we pre-allocate log files, it will free that space and allow the
+ *	log to be traversed.  We use the trunc_lsn because logging has already
+ *	opened the new/next log file before recovery ran.  This function assumes
+ *	we are in recovery or other dedicated time and not during live running.
+ */
+static int
+__log_truncate(WT_SESSION_IMPL *session, WT_LSN *lsn, uint32_t this_log)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	WT_FH *log_fh, *tmp_fh;
+	WT_LOG *log;
+	uint32_t lognum;
+	u_int i, logcount;
+	char **logfiles;
+
+	conn = S2C(session);
+	log = conn->log;
+	log_fh = NULL;
+	logcount = 0;
+	logfiles = NULL;
+
+	/*
+	 * Truncate the log file to the given LSN.
+	 */
+	WT_ERR(__log_openfile(session, 0, &log_fh, lsn->file));
+	WT_ERR(__wt_ftruncate(session, log_fh, lsn->offset));
+	tmp_fh = log_fh;
+	log_fh = NULL;
+	WT_ERR(__wt_close(session, tmp_fh));
+
+	/*
+	 * If we just want to truncate the current log, return and skip
+	 * looking for intervening logs.
+	 */
+	if (this_log)
+		goto err;
+	WT_ERR(__wt_log_get_files(session, &logfiles, &logcount));
+	for (i = 0; i < logcount; i++) {
+		WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
+		if (lognum > lsn->file && lognum < log->trunc_lsn.file) {
+			WT_ERR(__log_openfile(session, 0, &log_fh, lognum));
+			/*
+			 * If there are intervening files pre-allocated,
+			 * truncate them to the end of the log file header.
+			 */
+			WT_ERR(__wt_ftruncate(session,
+			    log_fh, LOG_FIRST_RECORD));
+			tmp_fh = log_fh;
+			log_fh = NULL;
+			WT_ERR(__wt_close(session, tmp_fh));
+		}
+	}
+err:	if (log_fh != NULL)
+		WT_TRET(__wt_close(session, log_fh));
+	if (logfiles != NULL)
+		__wt_log_files_free(session, logfiles, logcount);
+	return (ret);
+}
+
+/*
+ * __log_filesize --
+ *	Returns an estimate of the real end of log file.
+ */
+static int
+__log_filesize(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *eof)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	WT_LOG *log;
+	wt_off_t log_size, off, off1;
+	uint32_t allocsize, bufsz;
+	char *buf, *zerobuf;
+
+	conn = S2C(session);
+	log = conn->log;
+	if (eof == NULL)
+		return (0);
+	*eof = 0;
+	WT_RET(__wt_filesize(session, fh, &log_size));
+	if (log == NULL)
+		allocsize = LOG_ALIGN;
+	else
+		allocsize = log->allocsize;
+
+	/*
+	 * It can be very slow looking for the last real record in the log
+	 * in very small chunks.  Walk backward by a megabyte at a time.  When
+	 * we find a part of the log that is not just zeroes, walk to find
+	 * the last record.
+	 */
+	buf = zerobuf = NULL;
+	if (allocsize < WT_MEGABYTE && log_size > WT_MEGABYTE)
+		bufsz = WT_MEGABYTE;
+	else
+		bufsz = allocsize;
+	WT_RET(__wt_calloc_def(session, bufsz, &buf));
+	WT_ERR(__wt_calloc_def(session, bufsz, &zerobuf));
+
+	/*
+	 * Read in a chunk starting at the end of the file.  Keep going until
+	 * we reach the beginning or we find a chunk that contains any non-zero
+	 * bytes.  Compare against a known zero byte chunk.
+	 */
+	for (off = log_size - (wt_off_t)bufsz;
+	    off >= 0;
+	    off -= (wt_off_t)bufsz) {
+		WT_ERR(__wt_read(session, fh, off, bufsz, buf));
+		if (memcmp(buf, zerobuf, bufsz) != 0)
+			break;
+	}
+
+	/*
+	 * If we're walking by large amounts, now walk by the real allocsize
+	 * to find the real end, if we found something.  Otherwise we reached
+	 * the beginning of the file.  Offset can go negative if the log file
+	 * size is not a multiple of a megabyte.  The first chunk of the log
+	 * file will always be non-zero.
+	 */
+	if (off < 0)
+		off = 0;
+
+	/*
+	 * We know all log records are aligned at log->allocsize.  The first
+	 * item in a log record is always a 32-bit length.  Look for any
+	 * non-zero length at the allocsize boundary.  This may not be a true
+	 * log record since it could be the middle of a large record.  But we
+	 * know no log record starts after it.  Return an estimate of the log
+	 * file size.
+	 */
+	for (off1 = bufsz - allocsize;
+	    off1 > 0; off1 -= (wt_off_t)allocsize)
+		if (memcmp(buf + off1, zerobuf, sizeof(uint32_t)) != 0)
+			break;
+	off = off + off1;
+
+	/*
+	 * Set EOF to the last zero-filled record we saw.
+	 */
+	*eof = off + (wt_off_t)allocsize;
+err:
+	if (buf != NULL)
+		__wt_free(session, buf);
+	if (zerobuf != NULL)
+		__wt_free(session, zerobuf);
+	return (ret);
+}
+
+/*
+ * __log_acquire --
+ *	Called with the log slot lock held.  Can be called recursively
+ *	from __wt_log_newfile when we change log files.
+ */
+static int
+__log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	WT_LOG *log;
+
+	conn = S2C(session);
+	log = conn->log;
+	/*
+	 * Called locked.  Add recsize to alloc_lsn.  Save our starting LSN
+	 * where the previous allocation finished for the release LSN.
+	 * That way when log files switch, we're waiting for the correct LSN
+	 * from outstanding writes.
+	 */
+	slot->slot_release_lsn = log->alloc_lsn;
+	if (!__log_size_fit(session, &log->alloc_lsn, recsize)) {
+		WT_RET(__wt_log_newfile(session, 0));
+		if (log->log_close_fh != NULL)
+			F_SET(slot, SLOT_CLOSEFH);
+	}
+	/*
+	 * Checkpoints can be configured based on amount of log written.
+	 * Add in this log record to the sum and if needed, signal the
+	 * checkpoint condition.  The logging subsystem manages the
+	 * accumulated field.  There is a bit of layering violation
+	 * here checking the connection ckpt field and using its
+	 * condition.
+	 */
+	if (WT_CKPT_LOGSIZE(conn)) {
+		log->log_written += (wt_off_t)recsize;
+		WT_RET(__wt_checkpoint_signal(session, log->log_written));
+	}
+
+	/*
+	 * Need to minimally fill in slot info here.  Our slot start LSN
+	 * comes after any potential new log file creations.
+	 */
+	slot->slot_start_lsn = log->alloc_lsn;
+	slot->slot_start_offset = log->alloc_lsn.offset;
+	/*
+	 * Pre-allocate on the first real write into the log file.
+	 */
+	if (log->alloc_lsn.offset == LOG_FIRST_RECORD) {
+		if (!log->log_fh->fallocate_available ||
+		    (ret = __wt_fallocate(session, log->log_fh,
+		    LOG_FIRST_RECORD, conn->log_file_max)) == ENOTSUP)
+			ret = __wt_ftruncate(session, log->log_fh,
+			    LOG_FIRST_RECORD + conn->log_file_max);
+		WT_RET(ret);
+	}
+
+	log->alloc_lsn.offset += (wt_off_t)recsize;
+	slot->slot_end_lsn = log->alloc_lsn;
+	slot->slot_error = 0;
+	slot->slot_fh = log->log_fh;
+	return (0);
+}
+
+/*
+ * __log_release --
+ *	Release a log slot.
+ */
+static int
+__log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	WT_FH *close_fh;
+	WT_LOG *log;
+	WT_LSN sync_lsn;
+	size_t write_size;
+	WT_DECL_SPINLOCK_ID(id);			/* Must appear last */
+
+	conn = S2C(session);
+	log = conn->log;
+	/*
+	 * If we're going to have to close our log file, make a local copy
+	 * of the file handle structure.
+	 */
+	close_fh = NULL;
+	if (F_ISSET(slot, SLOT_CLOSEFH)) {
+		close_fh = log->log_close_fh;
+		log->log_close_fh = NULL;
+		F_CLR(slot, SLOT_CLOSEFH);
+	}
+
+	/* Write the buffered records */
+	if (F_ISSET(slot, SLOT_BUFFERED)) {
+		write_size = (size_t)
+		    (slot->slot_end_lsn.offset - slot->slot_start_offset);
+		WT_ERR(__wt_write(session, slot->slot_fh,
+		    slot->slot_start_offset, write_size, slot->slot_buf.mem));
+	}
+
+	/*
+	 * Wait for earlier groups to finish, otherwise there could be holes
+	 * in the log file.
+	 */
+	while (LOG_CMP(&log->write_lsn, &slot->slot_release_lsn) != 0)
+		__wt_yield();
+	log->write_lsn = slot->slot_end_lsn;
+	/*
+	 * Try to consolidate calls to fsync to wait less.  Acquire a spin lock
+	 * so that threads finishing writing to the log will wait while the
+	 * current fsync completes and advance log->write_lsn.
+	 */
+	while (F_ISSET(slot, SLOT_SYNC) &&
+	    LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
+		if (__wt_spin_trylock(session, &log->log_sync_lock, &id) != 0) {
+			(void)__wt_cond_wait(
+			    session, log->log_sync_cond, 10000);
+			continue;
+		}
+		/*
+		 * Record the current end of log after we grabbed the lock.
+		 * That is how far our fsync call with guarantee.
+		 */
+		sync_lsn = log->write_lsn;
+		if (LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
+			WT_STAT_FAST_CONN_INCR(session, log_sync);
+			ret = __wt_fsync(session, log->log_fh);
+			if (ret == 0) {
+				F_CLR(slot, SLOT_SYNC);
+				log->sync_lsn = sync_lsn;
+				ret = __wt_cond_signal(
+				    session, log->log_sync_cond);
+			}
+		}
+		__wt_spin_unlock(session, &log->log_sync_lock);
+		WT_ERR(ret);
+	}
+	if (F_ISSET(slot, SLOT_BUF_GROW)) {
+		WT_STAT_FAST_CONN_INCR(session, log_buffer_grow);
+		F_CLR(slot, SLOT_BUF_GROW);
+		WT_STAT_FAST_CONN_INCRV(session,
+		    log_buffer_size, slot->slot_buf.memsize);
+		WT_ERR(__wt_buf_grow(session,
+		    &slot->slot_buf, slot->slot_buf.memsize * 2));
+	}
+	/*
+	 * If we have a file to close, close it now.
+	 */
+	if (close_fh)
+		WT_ERR(__wt_close(session, close_fh));
+
+err:	if (ret != 0 && slot->slot_error == 0)
+		slot->slot_error = ret;
+	return (ret);
+}
+
+/*
+ * __wt_log_newfile --
+ *	Create the next log file and write the file header record into it.
+ */
+int
+__wt_log_newfile(WT_SESSION_IMPL *session, int conn_create)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_ITEM(buf);
+	WT_DECL_RET;
+	WT_LOG *log;
+	WT_LOG_DESC *desc;
+	WT_LOG_RECORD *logrec;
+	WT_LOGSLOT tmp;
+	WT_MYSLOT myslot;
+
+	conn = S2C(session);
+	log = conn->log;
+
+	/*
+	 * Set aside the log file handle to be closed later.  Other threads
+	 * may still be using it to write to the log.  If the log file size
+	 * is small we could fill a log file before the previous one is closed.
+	 * Wait for that to close.
+	 */
+	while (log->log_close_fh != NULL) {
+		__wt_errx(session,
+		    "log_newfile: Log file size %" PRIuMAX " too small",
+		    (uintmax_t)conn->log_file_max);
+		WT_STAT_FAST_CONN_INCR(session, log_close_yields);
+		__wt_yield();
+	}
+	log->log_close_fh = log->log_fh;
+	log->fileid++;
+	WT_RET(__log_openfile(session, 1, &log->log_fh, log->fileid));
+	log->alloc_lsn.file = log->fileid;
+	log->alloc_lsn.offset = log->log_fh->size;
+
+	/*
+	 * Set up the log descriptor record.  Use a scratch buffer to
+	 * get correct alignment for direct I/O.
+	 */
+	WT_ASSERT(session, sizeof(WT_LOG_DESC) < log->allocsize);
+	WT_RET(__wt_scr_alloc(session, log->allocsize, &buf));
+	memset(buf->mem, 0, log->allocsize);
+	logrec = (WT_LOG_RECORD *)buf->mem;
+	desc = (WT_LOG_DESC *)logrec->record;
+	desc->log_magic = WT_LOG_MAGIC;
+	desc->majorv = WT_LOG_MAJOR_VERSION;
+	desc->minorv = WT_LOG_MINOR_VERSION;
+	desc->log_size = (uint64_t)conn->log_file_max;
+
+	/*
+	 * Now that the record is set up, initialize the record header.
+	 */
+	logrec->len = log->allocsize;
+	logrec->checksum = 0;
+	logrec->checksum = __wt_cksum(logrec, log->allocsize);
+	WT_CLEAR(tmp);
+	myslot.slot = &tmp;
+	myslot.offset = 0;
+
+	/*
+	 * Recursively call __log_acquire to allocate log space for the
+	 * log descriptor record.  Call __log_fill to write it, but we
+	 * do not need to call __log_release because we're not waiting for
+	 * earlier operations to complete.
+	 */
+	WT_ERR(__log_acquire(session, logrec->len, &tmp));
+	WT_ERR(__log_fill(session, &myslot, 1, buf, NULL));
+
+	/*
+	 * If we're called from connection creation code, we need to update
+	 * the LSNs since we're the only write in progress.
+	 */
+	if (conn_create) {
+		WT_ERR(__wt_fsync(session, log->log_fh));
+		log->sync_lsn = tmp.slot_end_lsn;
+		log->write_lsn = tmp.slot_end_lsn;
+	}
+
+err:	__wt_scr_free(&buf);
+	return (ret);
+}
+
+/*
+ * __wt_log_read --
+ *	Read the log record at the given LSN.  Return the record (including
+ *	the log header) in the WT_ITEM.  Caller is responsible for freeing it.
+ */
+int
+__wt_log_read(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
+    uint32_t flags)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	WT_FH *log_fh;
+	WT_LOG *log;
+	WT_LOG_RECORD *logrec;
+	uint32_t cksum, rdup_len, reclen;
+
+	WT_UNUSED(flags);
+	/*
+	 * If the caller didn't give us an LSN or something to return,
+	 * there's nothing to do.
+	 */
+	if (lsnp == NULL || record == NULL)
+		return (0);
+	conn = S2C(session);
+	log = conn->log;
+	/*
+	 * If the offset isn't on an allocation boundary it must be wrong.
+	 */
+	if (lsnp->offset % log->allocsize != 0 || lsnp->file > log->fileid)
+		return (WT_NOTFOUND);
+
+	WT_RET(__log_openfile(session, 0, &log_fh, lsnp->file));
+	/*
+	 * Read the minimum allocation size a record could be.
+	 */
+	WT_ERR(__wt_buf_init(session, record, log->allocsize));
+	WT_ERR(__wt_read(session,
+	    log_fh, lsnp->offset, (size_t)log->allocsize, record->mem));
+	/*
+	 * First 4 bytes is the real record length.  See if we
+	 * need to read more than the allocation size.  We expect
+	 * that we rarely will have to read more.  Most log records
+	 * will be fairly small.
+	 */
+	reclen = *(uint32_t *)record->mem;
+	if (reclen == 0) {
+		ret = WT_NOTFOUND;
+		goto err;
+	}
+	if (reclen > log->allocsize) {
+		rdup_len = __wt_rduppo2(reclen, log->allocsize);
+		WT_ERR(__wt_buf_grow(session, record, rdup_len));
+		WT_ERR(__wt_read(session,
+		    log_fh, lsnp->offset, (size_t)rdup_len, record->mem));
+	}
+	/*
+	 * We read in the record, verify checksum.
+	 */
+	logrec = (WT_LOG_RECORD *)record->mem;
+	cksum = logrec->checksum;
+	logrec->checksum = 0;
+	logrec->checksum = __wt_cksum(logrec, logrec->len);
+	if (logrec->checksum != cksum)
+		WT_ERR_MSG(session, WT_ERROR, "log_read: Bad checksum");
+	record->size = logrec->len;
+	WT_STAT_FAST_CONN_INCR(session, log_reads);
+err:
+	WT_TRET(__wt_close(session, log_fh));
+	return (ret);
+}
+
+/*
+ * __wt_log_scan --
+ *	Scan the logs, calling a function on each record found.
+ */
+int
+__wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags,
+    int (*func)(WT_SESSION_IMPL *session,
+    WT_ITEM *record, WT_LSN *lsnp, void *cookie), void *cookie)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	WT_FH *log_fh;
+	WT_ITEM buf;
+	WT_LOG *log;
+	WT_LOG_RECORD *logrec;
+	WT_LSN end_lsn, rd_lsn, start_lsn;
+	wt_off_t log_size;
+	uint32_t allocsize, cksum, firstlog, lastlog, lognum, rdup_len, reclen;
+	u_int i, logcount;
+	int eol;
+	char **logfiles;
+
+	conn = S2C(session);
+	log = conn->log;
+	log_fh = NULL;
+	logcount = 0;
+	logfiles = NULL;
+	eol = 0;
+	WT_CLEAR(buf);
+
+	/*
+	 * If the caller did not give us a callback function there is nothing
+	 * to do.
+	 */
+	if (func == NULL)
+		return (0);
+
+	if (LF_ISSET(WT_LOGSCAN_RECOVER))
+		WT_RET(__wt_verbose(session, WT_VERB_LOG,
+		    "__wt_log_scan truncating to %u/%" PRIuMAX,
+		    log->trunc_lsn.file, (uintmax_t)log->trunc_lsn.offset));
+
+	if (log != NULL) {
+		allocsize = log->allocsize;
+
+		if (lsnp == NULL) {
+			if (LF_ISSET(WT_LOGSCAN_FIRST))
+				start_lsn = log->first_lsn;
+			else if (LF_ISSET(WT_LOGSCAN_FROM_CKP))
+				start_lsn = log->ckpt_lsn;
+			else
+				return (WT_ERROR);	/* Illegal usage */
+		} else {
+			if (LF_ISSET(WT_LOGSCAN_FIRST|WT_LOGSCAN_FROM_CKP))
+				WT_RET_MSG(session, WT_ERROR,
+			    "choose either a start LSN or a start flag");
+
+			/* Offsets must be on allocation boundaries. */
+			if (lsnp->offset % allocsize != 0 ||
+			    lsnp->file > log->fileid)
+				return (WT_NOTFOUND);
+
+			/*
+			 * Log cursors may not know the starting LSN.  If an
+			 * LSN pointer is passed in, but it is the INIT_LSN,
+			 * start from the first_lsn.
+			 */
+			start_lsn = *lsnp;
+			if (IS_INIT_LSN(&start_lsn))
+				start_lsn = log->first_lsn;
+		}
+		end_lsn = log->alloc_lsn;
+	} else {
+		/*
+		 * If logging is not configured, we can still print out the log
+		 * if log files exist.  We just need to set the LSNs from what
+		 * is in the files versus what is in the live connection.
+		 */
+		/*
+		 * Set allocsize to the minimum alignment it could be.  Larger
+		 * records and larger allocation boundaries should always be
+		 * a multiple of this.
+		 */
+		allocsize = LOG_ALIGN;
+		lastlog = 0;
+		firstlog = UINT32_MAX;
+		WT_RET(__wt_log_get_files(session, &logfiles, &logcount));
+		if (logcount == 0)
+			/*
+			 * Return it is not supported if none don't exist.
+			 */
+			return (ENOTSUP);
+		for (i = 0; i < logcount; i++) {
+			WT_ERR(__wt_log_extract_lognum(session, logfiles[i],
+			    &lognum));
+			lastlog = WT_MAX(lastlog, lognum);
+			firstlog = WT_MIN(firstlog, lognum);
+		}
+		start_lsn.file = firstlog;
+		end_lsn.file = lastlog;
+		start_lsn.offset = end_lsn.offset = 0;
+		__wt_log_files_free(session, logfiles, logcount);
+		logfiles = NULL;
+	}
+	WT_ERR(__log_openfile(session, 0, &log_fh, start_lsn.file));
+	WT_ERR(__log_filesize(session, log_fh, &log_size));
+	rd_lsn = start_lsn;
+	WT_ERR(__wt_buf_initsize(session, &buf, LOG_ALIGN));
+	for (;;) {
+		if (rd_lsn.offset + allocsize > log_size) {
+advance:
+			/*
+			 * If we read the last record, go to the next file.
+			 */
+			WT_ERR(__wt_close(session, log_fh));
+			log_fh = NULL;
+			eol = 1;
+			/*
+			 * Truncate this log file before we move to the next.
+			 */
+			if (LF_ISSET(WT_LOGSCAN_RECOVER))
+				WT_ERR(__log_truncate(session, &rd_lsn, 1));
+			rd_lsn.file++;
+			rd_lsn.offset = 0;
+			/*
+			 * Avoid an error message when we reach end of log
+			 * by checking here.
+			 */
+			if (rd_lsn.file > end_lsn.file)
+				break;
+			WT_ERR(__log_openfile(
+			    session, 0, &log_fh, rd_lsn.file));
+			WT_ERR(__log_filesize(session, log_fh, &log_size));
+			continue;
+		}
+		/*
+		 * Read the minimum allocation size a record could be.
+		 */
+		WT_ASSERT(session, buf.memsize >= allocsize);
+		WT_ERR(__wt_read(session,
+		    log_fh, rd_lsn.offset, (size_t)allocsize, buf.mem));
+		/*
+		 * First 8 bytes is the real record length.  See if we
+		 * need to read more than the allocation size.  We expect
+		 * that we rarely will have to read more.  Most log records
+		 * will be fairly small.
+		 */
+		reclen = *(uint32_t *)buf.mem;
+		/*
+		 * Log files are pre-allocated.  We never expect a zero length
+		 * unless we've reached the end of the log.  The log can be
+		 * written out of order, so when recovery finds the end of
+		 * the log, truncate the file and remove any later log files
+		 * that may exist.
+		 */
+		if (reclen == 0) {
+			/* This LSN is the end. */
+			break;
+		}
+		rdup_len = __wt_rduppo2(reclen, allocsize);
+		if (reclen > allocsize) {
+			/*
+			 * The log file end could be the middle of this
+			 * log record.
+			 */
+			if (rd_lsn.offset + rdup_len > log_size)
+				goto advance;
+			/*
+			 * We need to round up and read in the full padded
+			 * record, especially for direct I/O.
+			 */
+			WT_ERR(__wt_buf_grow(session, &buf, rdup_len));
+			WT_ERR(__wt_read(session,
+			    log_fh, rd_lsn.offset, (size_t)rdup_len, buf.mem));
+			WT_STAT_FAST_CONN_INCR(session, log_scan_rereads);
+		}
+		/*
+		 * We read in the record, verify checksum.
+		 */
+		buf.size = reclen;
+		logrec = (WT_LOG_RECORD *)buf.mem;
+		cksum = logrec->checksum;
+		logrec->checksum = 0;
+		logrec->checksum = __wt_cksum(logrec, logrec->len);
+		if (logrec->checksum != cksum) {
+			/*
+			 * A checksum mismatch means we have reached the end of
+			 * the useful part of the log.  This should be found on
+			 * the first pass through recovery.  In the second pass
+			 * where we truncate the log, this is where it should
+			 * end.
+			 */
+			if (log != NULL)
+				log->trunc_lsn = rd_lsn;
+			break;
+		}
+
+		/*
+		 * We have a valid log record.  If it is not the log file
+		 * header, invoke the callback.
+		 */
+		WT_STAT_FAST_CONN_INCR(session, log_scan_records);
+		if (rd_lsn.offset != 0) {
+			WT_ERR((*func)(session, &buf, &rd_lsn, cookie));
+			if (LF_ISSET(WT_LOGSCAN_ONE))
+				break;
+		}
+		rd_lsn.offset += (wt_off_t)rdup_len;
+	}
+
+	/* Truncate if we're in recovery. */
+	if (LF_ISSET(WT_LOGSCAN_RECOVER) &&
+	    LOG_CMP(&rd_lsn, &log->trunc_lsn) < 0)
+		WT_ERR(__log_truncate(session, &rd_lsn, 0));
+
+err:	WT_STAT_FAST_CONN_INCR(session, log_scans);
+	if (logfiles != NULL)
+		__wt_log_files_free(session, logfiles, logcount);
+	__wt_buf_free(session, &buf);
+	/*
+	 * If the caller wants one record and it is at the end of log,
+	 * return WT_NOTFOUND.
+	 */
+	if (LF_ISSET(WT_LOGSCAN_ONE) && eol && ret == 0)
+		ret = WT_NOTFOUND;
+	if (ret == ENOENT)
+		ret = 0;
+	if (log_fh != NULL)
+		WT_TRET(__wt_close(session, log_fh));
+	return (ret);
+}
+
+/*
+ * __log_direct_write --
+ *	Write a log record without using the consolidation arrays.
+ */
+static int
+__log_direct_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
+    uint32_t flags)
+{
+	WT_DECL_RET;
+	WT_LOG *log;
+	WT_LOGSLOT tmp;
+	WT_MYSLOT myslot;
+	int locked;
+	WT_DECL_SPINLOCK_ID(id);			/* Must appear last */
+
+	log = S2C(session)->log;
+	myslot.slot = &tmp;
+	myslot.offset = 0;
+	WT_CLEAR(tmp);
+
+	/* Fast path the contended case. */
+	if (__wt_spin_trylock(session, &log->log_slot_lock, &id) != 0)
+		return (EAGAIN);
+	locked = 1;
+
+	if (LF_ISSET(WT_LOG_FSYNC))
+		F_SET(&tmp, SLOT_SYNC);
+	WT_ERR(__log_acquire(session, record->size, &tmp));
+	__wt_spin_unlock(session, &log->log_slot_lock);
+	locked = 0;
+	WT_ERR(__log_fill(session, &myslot, 1, record, lsnp));
+	WT_ERR(__log_release(session, &tmp));
+
+err:	if (locked)
+		__wt_spin_unlock(session, &log->log_slot_lock);
+	return (ret);
+}
+
+/*
+ * __wt_log_write --
+ *	Write a record into the log.
+ */
+int
+__wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
+    uint32_t flags)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	WT_LOG *log;
+	WT_LOG_RECORD *logrec;
+	WT_LSN lsn;
+	WT_MYSLOT myslot;
+	uint32_t rdup_len;
+	int locked;
+
+	conn = S2C(session);
+	log = conn->log;
+	locked = 0;
+	INIT_LSN(&lsn);
+	myslot.slot = NULL;
+	/*
+	 * Assume the WT_ITEM the user passed is a WT_LOG_RECORD, which has
+	 * a header at the beginning for us to fill in.
+	 *
+	 * If using direct_io, the caller should pass us an aligned record.
+	 * But we need to make sure it is big enough and zero-filled so
+	 * that we can write the full amount.  Do this whether or not
+	 * direct_io is in use because it makes the reading code cleaner.
+	 */
+	WT_STAT_FAST_CONN_INCRV(session, log_bytes_user, record->size);
+	rdup_len = __wt_rduppo2((uint32_t)record->size, log->allocsize);
+	WT_ERR(__wt_buf_grow(session, record, rdup_len));
+	WT_ASSERT(session, record->data == record->mem);
+	/*
+	 * If the caller's record only partially fills the necessary
+	 * space, we need to zero-fill the remainder.
+	 */
+	if (record->size != rdup_len) {
+		memset((uint8_t *)record->mem + record->size, 0,
+		    rdup_len - record->size);
+		record->size = rdup_len;
+	}
+	logrec = (WT_LOG_RECORD *)record->mem;
+	logrec->len = (uint32_t)record->size;
+	logrec->checksum = 0;
+	logrec->checksum = __wt_cksum(logrec, record->size);
+
+	WT_STAT_FAST_CONN_INCR(session, log_writes);
+
+	if (!F_ISSET(log, WT_LOG_FORCE_CONSOLIDATE)) {
+		ret = __log_direct_write(session, record, lsnp, flags);
+		if (ret == 0)
+			return (0);
+		if (ret != EAGAIN)
+			WT_ERR(ret);
+		/*
+		 * An EAGAIN return means we failed to get the try lock -
+		 * fall through to the consolidation code in that case.
+		 */
+	}
+
+	/*
+	 * As soon as we see contention for the log slot, disable direct
+	 * log writes. We get better performance by forcing writes through
+	 * the consolidation code. This is because individual writes flood
+	 * the I/O system faster than they contend on the log slot lock.
+	 */
+	F_SET(log, WT_LOG_FORCE_CONSOLIDATE);
+	if ((ret = __wt_log_slot_join(
+	    session, rdup_len, flags, &myslot)) == ENOMEM) {
+		/*
+		 * If we couldn't find a consolidated slot for this record
+		 * write the record directly.
+		 */
+		while ((ret = __log_direct_write(
+		    session, record, lsnp, flags)) == EAGAIN)
+			;
+		WT_ERR(ret);
+		/*
+		 * Increase the buffer size of any slots we can get access
+		 * to, so future consolidations are likely to succeed.
+		 */
+		WT_ERR(__wt_log_slot_grow_buffers(session, 4 * rdup_len));
+		return (0);
+	}
+	WT_ERR(ret);
+	if (myslot.offset == 0) {
+		__wt_spin_lock(session, &log->log_slot_lock);
+		locked = 1;
+		WT_ERR(__wt_log_slot_close(session, myslot.slot));
+		WT_ERR(__log_acquire(
+		    session, myslot.slot->slot_group_size, myslot.slot));
+		__wt_spin_unlock(session, &log->log_slot_lock);
+		locked = 0;
+		WT_ERR(__wt_log_slot_notify(session, myslot.slot));
+	} else
+		WT_ERR(__wt_log_slot_wait(session, myslot.slot));
+	WT_ERR(__log_fill(session, &myslot, 0, record, &lsn));
+	if (__wt_log_slot_release(myslot.slot, rdup_len) == WT_LOG_SLOT_DONE) {
+		WT_ERR(__log_release(session, myslot.slot));
+		WT_ERR(__wt_log_slot_free(myslot.slot));
+	} else if (LF_ISSET(WT_LOG_FSYNC)) {
+		/* Wait for our writes to reach disk */
+		while (LOG_CMP(&log->sync_lsn, &lsn) <= 0 &&
+		    myslot.slot->slot_error == 0)
+			(void)__wt_cond_wait(
+			    session, log->log_sync_cond, 10000);
+	}
+err:
+	if (locked)
+		__wt_spin_unlock(session, &log->log_slot_lock);
+	if (ret == 0 && lsnp != NULL)
+		*lsnp = lsn;
+	/*
+	 * If we're synchronous and some thread had an error, we don't know
+	 * if our write made it out to the file or not.  The error could be
+	 * before or after us.  So, if anyone got an error, we report it.
+	 * If we're not synchronous, only report if our own operation got
+	 * an error.
+	 */
+	if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC) && ret == 0 &&
+	    myslot.slot != NULL)
+		ret = myslot.slot->slot_error;
+	return (ret);
+}
+
+/*
+ * __wt_log_vprintf --
+ *	Write a message into the log.
+ */
+int
+__wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_ITEM(logrec);
+	WT_DECL_RET;
+	va_list ap_copy;
+	const char *rec_fmt = WT_UNCHECKED_STRING(I);
+	uint32_t rectype = WT_LOGREC_MESSAGE;
+	size_t header_size, len;
+
+	conn = S2C(session);
+
+	if (!conn->logging)
+		return (0);
+
+	va_copy(ap_copy, ap);
+	len = (size_t)vsnprintf(NULL, 0, fmt, ap_copy) + 1;
+	va_end(ap_copy);
+
+	WT_RET(
+	    __wt_logrec_alloc(session, sizeof(WT_LOG_RECORD) + len, &logrec));
+
+	/*
+	 * We're writing a record with the type (an integer) followed by a
+	 * string (NUL-terminated data).  To avoid writing the string into
+	 * a buffer before copying it, we write the header first, then the
+	 * raw bytes of the string.
+	 */
+	WT_ERR(__wt_struct_size(session, &header_size, rec_fmt, rectype));
+	WT_ERR(__wt_struct_pack(session,
+	    (uint8_t *)logrec->data + logrec->size, header_size,
+	    rec_fmt, rectype));
+	logrec->size += (uint32_t)header_size;
+
+	(void)vsnprintf((char *)logrec->data + logrec->size, len, fmt, ap);
+
+	WT_ERR(__wt_verbose(session, WT_VERB_LOG,
+	    "log_printf: %s", (char *)logrec->data + logrec->size));
+
+	logrec->size += len;
+	WT_ERR(__wt_log_write(session, logrec, NULL, 0));
+err:	__wt_scr_free(&logrec);
+	return (ret);
+}