/*-
 * Copyright (c) 2014-2016 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

static int __log_write_internal(
	WT_SESSION_IMPL *, WT_ITEM *, WT_LSN *, uint32_t);

#define	WT_LOG_COMPRESS_SKIP	(offsetof(WT_LOG_RECORD, record))
#define	WT_LOG_ENCRYPT_SKIP	(offsetof(WT_LOG_RECORD, record))

/*
 * __wt_log_ckpt --
 *	Record the given LSN as the checkpoint LSN and signal the archive
 *	thread as needed.
 */
int
__wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn)
{
	WT_CONNECTION_IMPL *conn;
	WT_LOG *log;

	conn = S2C(session);
	log = conn->log;
	log->ckpt_lsn = *ckp_lsn;
	if (conn->log_cond != NULL)
		WT_RET(__wt_cond_signal(session, conn->log_cond));
	return (0);
}

/*
 * __wt_log_flush_lsn --
 *	Force out buffered records and return the LSN, either the
 *	write_start_lsn or write_lsn depending on the argument.
 */
int
__wt_log_flush_lsn(WT_SESSION_IMPL *session, WT_LSN *lsn, bool start)
{
	WT_CONNECTION_IMPL *conn;
	WT_LOG *log;

	conn = S2C(session);
	log = conn->log;
	WT_RET(__wt_log_force_write(session, 1));
	WT_RET(__wt_log_wrlsn(session, NULL));
	if (start)
		*lsn = log->write_start_lsn;
	else
		*lsn = log->write_lsn;
	return (0);
}

/*
 * __wt_log_background --
 *	Record the given LSN as the background LSN and signal the
 *	thread as needed.
 */
int
__wt_log_background(WT_SESSION_IMPL *session, WT_LSN *lsn)
{
	WT_CONNECTION_IMPL *conn;
	WT_LOG *log;

	conn = S2C(session);
	log = conn->log;
	/*
	 * If a thread already set the LSN to a bigger LSN, we're done.
	 */
	if (__wt_log_cmp(&session->bg_sync_lsn, lsn) > 0)
		return (0);
	session->bg_sync_lsn = *lsn;

	/*
	 * Advance the logging subsystem background sync LSN if
	 * needed.
	 */
	__wt_spin_lock(session, &log->log_sync_lock);
	if (__wt_log_cmp(lsn, &log->bg_sync_lsn) > 0)
		log->bg_sync_lsn = *lsn;
	__wt_spin_unlock(session, &log->log_sync_lock);
	return (__wt_cond_signal(session, conn->log_file_cond));
}

/*
 * __wt_log_force_sync --
 *	Force a sync of the log and files.
 */
int
__wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn)
{
	WT_LOG *log;
	WT_DECL_RET;

	log = S2C(session)->log;

	/*
	 * We need to wait for the previous log file to get written
	 * to disk before we sync out the current one and advance
	 * the LSN.  Signal the worker thread because we know the
	 * LSN has moved into a later log file and there should be a
	 * log file ready to close.
	 */
	while (log->sync_lsn.file < min_lsn->file) {
		WT_ERR(__wt_cond_signal(session,
		    S2C(session)->log_file_cond));
		WT_ERR(__wt_cond_wait(session, log->log_sync_cond, 10000));
	}
	__wt_spin_lock(session, &log->log_sync_lock);
	WT_ASSERT(session, log->log_dir_fh != NULL);
	/*
	 * Sync the directory if the log file entry hasn't been written
	 * into the directory.
	 */
	if (log->sync_dir_lsn.file < min_lsn->file) {
		WT_ERR(__wt_verbose(session, WT_VERB_LOG,
		    "log_force_sync: sync directory %s to LSN %d/%lu",
		    log->log_dir_fh->name, min_lsn->file, min_lsn->offset));
		WT_ERR(__wt_directory_sync_fh(session, log->log_dir_fh));
		log->sync_dir_lsn = *min_lsn;
		WT_STAT_FAST_CONN_INCR(session, log_sync_dir);
	}
	/*
	 * Sync the log file if needed.
	 */
	if (__wt_log_cmp(&log->sync_lsn, min_lsn) < 0) {
		WT_ERR(__wt_verbose(session, WT_VERB_LOG,
		    "log_force_sync: sync %s to LSN %d/%lu",
		    log->log_fh->name, min_lsn->file, min_lsn->offset));
		WT_ERR(__wt_fsync(session, log->log_fh));
		log->sync_lsn = *min_lsn;
		WT_STAT_FAST_CONN_INCR(session, log_sync);
		WT_ERR(__wt_cond_signal(session, log->log_sync_cond));
	}
err:
	__wt_spin_unlock(session, &log->log_sync_lock);
	return (ret);
}

/*
 * __wt_log_needs_recovery --
 *	Return 0 if we encounter a clean shutdown and 1 if recovery
 *	must be run in the given variable.
 */
int
__wt_log_needs_recovery(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn, bool *recp)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR *c;
	WT_DECL_RET;
	WT_ITEM dummy_key, dummy_value;
	WT_LOG *log;
	uint64_t dummy_txnid;
	uint32_t dummy_fileid, dummy_optype, rectype;

	conn = S2C(session);
	log = conn->log;

	/*
	 * Default is to run recovery always (regardless of whether this
	 * connection has logging enabled).
	 */
	*recp = true;
	if (log == NULL)
		return (0);

	/*
	 * See if there are any data modification records between the
	 * checkpoint LSN and the end of the log.  If there are none then
	 * we can skip recovery.
	 */
	WT_RET(__wt_curlog_open(session, "log:", NULL, &c));
	c->set_key(c, ckp_lsn->file, ckp_lsn->offset, 0);
	if ((ret = c->search(c)) == 0) {
		while ((ret = c->next(c)) == 0) {
			/*
			 * The only thing we care about is the rectype.
			 */
			WT_ERR(c->get_value(c, &dummy_txnid, &rectype,
			    &dummy_optype, &dummy_fileid,
			    &dummy_key, &dummy_value));
			if (rectype == WT_LOGREC_COMMIT)
				break;
		}
		/*
		 * If we get to the end of the log, we can skip recovery.
		 */
		if (ret == WT_NOTFOUND) {
			*recp = false;
			ret = 0;
		}
	} else if (ret == WT_NOTFOUND)
		/*
		 * We should always find the checkpoint LSN as it now points
		 * to the beginning of a written log record.  But if we're
		 * running recovery on an earlier database we may not.  In
		 * that case, we need to run recovery, don't return an error.
		 */
		ret = 0;
	else
		WT_ERR(ret);

err:	WT_TRET(c->close(c));
	return (ret);
}

/*
 * __wt_log_written_reset --
 *	Interface to reset the amount of log written during this
 *	checkpoint period.  Called from the checkpoint code.
 */
void
__wt_log_written_reset(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_LOG *log;

	conn = S2C(session);
	if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
		return;
	log = conn->log;
	log->log_written = 0;
	return;
}

/*
 * __log_get_files --
 *	Retrieve the list of all log-related files of the given prefix type.
 */
static int
__log_get_files(WT_SESSION_IMPL *session,
    const char *file_prefix, char ***filesp, u_int *countp)
{
	WT_CONNECTION_IMPL *conn;
	const char *log_path;

	*countp = 0;
	*filesp = NULL;

	conn = S2C(session);
	log_path = conn->log_path;
	if (log_path == NULL)
		log_path = "";
	return (__wt_dirlist(session, log_path, file_prefix,
	    WT_DIRLIST_INCLUDE, filesp, countp));
}

/*
 * __wt_log_get_all_files --
 *	Retrieve the list of log files, either all of them or only the active
 *	ones (those that are not candidates for archiving).
 */
int
__wt_log_get_all_files(WT_SESSION_IMPL *session,
    char ***filesp, u_int *countp, uint32_t *maxid, bool active_only)
{
	WT_DECL_RET;
	WT_LOG *log;
	char **files;
	uint32_t id, max;
	u_int count, i;

	id = 0;
	log = S2C(session)->log;

	*maxid = 0;
	/*
	 * These may be files needed by backup.  Force the current slot
	 * to get written to the file.
	 */
	WT_RET(__wt_log_force_write(session, 1));
	WT_RET(__log_get_files(session, WT_LOG_FILENAME, &files, &count));

	/* Filter out any files that are below the checkpoint LSN. */
	for (max = 0, i = 0; i < count; ) {
		WT_ERR(__wt_log_extract_lognum(session, files[i], &id));
		if (active_only && id < log->ckpt_lsn.file) {
			__wt_free(session, files[i]);
			files[i] = files[count - 1];
			files[--count] = NULL;
		} else {
			if (id > max)
				max = id;
			i++;
		}
	}

	*maxid = max;
	*filesp = files;
	*countp = count;

	if (0) {
err:		__wt_log_files_free(session, files, count);
	}
	return (ret);
}

/*
 * __wt_log_files_free --
 *	Free memory associated with a log file list.
 */
void
__wt_log_files_free(WT_SESSION_IMPL *session, char **files, u_int count)
{
	u_int i;

	for (i = 0; i < count; i++)
		__wt_free(session, files[i]);
	__wt_free(session, files);
}

/*
 * __log_filename --
 *	Given a log number, return a WT_ITEM of a generated log file name
 *	of the given prefix type.
 */
static int
__log_filename(WT_SESSION_IMPL *session,
    uint32_t id, const char *file_prefix, WT_ITEM *buf)
{
	const char *log_path;

	log_path = S2C(session)->log_path;

	if (log_path != NULL && log_path[0] != '\0')
		WT_RET(__wt_buf_fmt(session, buf, "%s/%s.%010" PRIu32,
		    log_path, file_prefix, id));
	else
		WT_RET(__wt_buf_fmt(session, buf, "%s.%010" PRIu32,
		    file_prefix, id));

	return (0);
}

/*
 * __wt_log_extract_lognum --
 *	Given a log file name, extract out the log number.
 */
int
__wt_log_extract_lognum(
    WT_SESSION_IMPL *session, const char *name, uint32_t *id)
{
	const char *p;

	WT_UNUSED(session);

	if (id == NULL || name == NULL)
		return (WT_ERROR);
	if ((p = strrchr(name, '.')) == NULL ||
	    sscanf(++p, "%" SCNu32, id) != 1)
		WT_RET_MSG(session, WT_ERROR, "Bad log file name '%s'", name);
	return (0);
}

/*
 * __log_zero --
 *	Zero a log file.
 */
static int
__log_zero(WT_SESSION_IMPL *session,
    WT_FH *fh, wt_off_t start_off, wt_off_t len)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_ITEM(zerobuf);
	WT_DECL_RET;
	WT_LOG *log;
	uint32_t allocsize, bufsz, off, partial, wrlen;

	conn = S2C(session);
	log = conn->log;
	allocsize = log->allocsize;
	zerobuf = NULL;
	if (allocsize < WT_MEGABYTE)
		bufsz = WT_MEGABYTE;
	else
		bufsz = allocsize;
	/*
	 * If they're using smaller log files, cap it at the file size.
	 */
	if (conn->log_file_max < bufsz)
		bufsz = (uint32_t)conn->log_file_max;
	WT_RET(__wt_scr_alloc(session, bufsz, &zerobuf));
	memset(zerobuf->mem, 0, zerobuf->memsize);
	WT_STAT_FAST_CONN_INCR(session, log_zero_fills);

	/*
	 * Read in a chunk starting at the end of the file.  Keep going until
	 * we reach the beginning or we find a chunk that contains any non-zero
	 * bytes.  Compare against a known zero byte chunk.
	 */
	off = (uint32_t)start_off;
	while (off < (uint32_t)len) {
		/*
		 * Typically we start to zero the file after the log header
		 * and the bufsz is a sector-aligned size.  So we want to
		 * align our writes when we can.
		 */
		partial = off % bufsz;
		if (partial != 0)
			wrlen = bufsz - partial;
		else
			wrlen = bufsz;
		/*
		 * Check if we're writing a partial amount at the end too.
		 */
		if ((uint32_t)len - off < bufsz)
			wrlen = (uint32_t)len - off;
		WT_ERR(__wt_write(session,
		    fh, (wt_off_t)off, wrlen, zerobuf->mem));
		off += wrlen;
	}
err:	__wt_scr_free(session, &zerobuf);
	return (ret);
}

/*
 * __log_prealloc --
 *	Pre-allocate a log file.
 */
static int
__log_prealloc(WT_SESSION_IMPL *session, WT_FH *fh)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;

	conn = S2C(session);
	log = conn->log;
	ret = 0;
	/*
	 * If the user configured zero filling, pre-allocate the log file
	 * manually.  Otherwise use either fallocate or ftruncate to create
	 * and zero the log file based on what is available.
	 */
	if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ZERO_FILL))
		ret = __log_zero(session, fh,
		    WT_LOG_FIRST_RECORD, conn->log_file_max);
	else if (fh->fallocate_available == WT_FALLOCATE_NOT_AVAILABLE ||
	    (ret = __wt_fallocate(session, fh,
	    WT_LOG_FIRST_RECORD, conn->log_file_max)) == ENOTSUP)
		ret = __wt_ftruncate(session, fh,
		    WT_LOG_FIRST_RECORD + conn->log_file_max);
	return (ret);
}

/*
 * __log_size_fit --
 *	Return whether or not recsize will fit in the log file.
 */
static int
__log_size_fit(WT_SESSION_IMPL *session, WT_LSN *lsn, uint64_t recsize)
{
	WT_CONNECTION_IMPL *conn;
	WT_LOG *log;

	conn = S2C(session);
	log = conn->log;
	return (lsn->offset == WT_LOG_FIRST_RECORD ||
	    lsn->offset + (wt_off_t)recsize < conn->log_file_max);
}

/*
 * __log_decompress --
 *	Decompress a log record.
 */
static int
__log_decompress(WT_SESSION_IMPL *session, WT_ITEM *in, WT_ITEM *out)
{
	WT_COMPRESSOR *compressor;
	WT_CONNECTION_IMPL *conn;
	WT_LOG_RECORD *logrec;
	size_t result_len, skip;
	uint32_t uncompressed_size;

	conn = S2C(session);
	logrec = (WT_LOG_RECORD *)in->mem;
	skip = WT_LOG_COMPRESS_SKIP;
	compressor = conn->log_compressor;
	if (compressor == NULL || compressor->decompress == NULL)
		WT_RET_MSG(session, WT_ERROR,
		    "log_decompress: Compressed record with "
		    "no configured compressor");
	uncompressed_size = logrec->mem_len;
	WT_RET(__wt_buf_initsize(session, out, uncompressed_size));
	memcpy(out->mem, in->mem, skip);
	WT_RET(compressor->decompress(compressor, &session->iface,
	    (uint8_t *)in->mem + skip, in->size - skip,
	    (uint8_t *)out->mem + skip,
	    uncompressed_size - skip, &result_len));

	/*
	 * If checksums were turned off because we're depending on the
	 * decompression to fail on any corrupted data, we'll end up
	 * here after corruption happens.  If we're salvaging the file,
	 * it's OK, otherwise it's really, really bad.
	 */
	if (result_len != uncompressed_size - WT_LOG_COMPRESS_SKIP)
		return (WT_ERROR);

	return (0);
}

/*
 * __log_decrypt --
 *	Decrypt a log record.
 */
static int
__log_decrypt(WT_SESSION_IMPL *session, WT_ITEM *in, WT_ITEM *out)
{
	WT_CONNECTION_IMPL *conn;
	WT_ENCRYPTOR *encryptor;
	WT_KEYED_ENCRYPTOR *kencryptor;

	conn = S2C(session);
	kencryptor = conn->kencryptor;
	if (kencryptor == NULL ||
	    (encryptor = kencryptor->encryptor) == NULL ||
	    encryptor->decrypt == NULL)
		WT_RET_MSG(session, WT_ERROR,
		    "log_decrypt: Encrypted record with "
		    "no configured decrypt method");

	return (__wt_decrypt(session, encryptor, WT_LOG_ENCRYPT_SKIP, in, out));
}

/*
 * __log_fill --
 *	Copy a thread's log records into the assigned slot.
 */
static int
__log_fill(WT_SESSION_IMPL *session,
    WT_MYSLOT *myslot, bool force, WT_ITEM *record, WT_LSN *lsnp)
{
	WT_DECL_RET;
	WT_LOG_RECORD *logrec;

	logrec = (WT_LOG_RECORD *)record->mem;
	/*
	 * Call __wt_write or copy into the buffer.  For now the offset is the
	 * real byte offset.  If the offset becomes a unit of WT_LOG_ALIGN this
	 * is where we would multiply by WT_LOG_ALIGN to get the real file byte
	 * offset for write().
	 */
	if (!force && !F_ISSET(myslot, WT_MYSLOT_UNBUFFERED))
		memcpy((char *)myslot->slot->slot_buf.mem + myslot->offset,
		    logrec, logrec->len);
	else
		/*
		 * If this is a force or unbuffered write, write it now.
		 */
		WT_ERR(__wt_write(session, myslot->slot->slot_fh,
		    myslot->offset + myslot->slot->slot_start_offset,
		    (size_t)logrec->len, (void *)logrec));

	WT_STAT_FAST_CONN_INCRV(session, log_bytes_written, logrec->len);
	if (lsnp != NULL) {
		*lsnp = myslot->slot->slot_start_lsn;
		lsnp->offset += (wt_off_t)myslot->offset;
	}
err:
	if (ret != 0 && myslot->slot->slot_error == 0)
		myslot->slot->slot_error = ret;
	return (ret);
}

/*
 * __log_file_header --
 *	Create and write a log file header into a file handle.  If writing
 *	into the main log, it will be called locked.  If writing into a
 *	pre-allocated log, it will be called unlocked.
 */
static int
__log_file_header(
    WT_SESSION_IMPL *session, WT_FH *fh, WT_LSN *end_lsn, bool prealloc)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_ITEM(buf);
	WT_DECL_RET;
	WT_LOG *log;
	WT_LOG_DESC *desc;
	WT_LOG_RECORD *logrec;
	WT_LOGSLOT tmp;
	WT_MYSLOT myslot;

	conn = S2C(session);
	log = conn->log;

	/*
	 * Set up the log descriptor record.  Use a scratch buffer to
	 * get correct alignment for direct I/O.
	 */
	WT_ASSERT(session, sizeof(WT_LOG_DESC) < log->allocsize);
	WT_RET(__wt_scr_alloc(session, log->allocsize, &buf));
	memset(buf->mem, 0, log->allocsize);
	logrec = (WT_LOG_RECORD *)buf->mem;
	desc = (WT_LOG_DESC *)logrec->record;
	desc->log_magic = WT_LOG_MAGIC;
	desc->majorv = WT_LOG_MAJOR_VERSION;
	desc->minorv = WT_LOG_MINOR_VERSION;
	desc->log_size = (uint64_t)conn->log_file_max;

	/*
	 * Now that the record is set up, initialize the record header.
	 */
	logrec->len = log->allocsize;
	logrec->checksum = 0;
	logrec->checksum = __wt_cksum(logrec, log->allocsize);
	WT_CLEAR(tmp);
	memset(&myslot, 0, sizeof(myslot));
	myslot.slot = &tmp;

	/*
	 * We may recursively call __wt_log_acquire to allocate log space for
	 * the log descriptor record.  Call __log_fill to write it, but we
	 * do not need to call __wt_log_release because we're not waiting for
	 * any earlier operations to complete.
	 */
	if (prealloc) {
		WT_ASSERT(session, fh != NULL);
		tmp.slot_fh = fh;
	} else {
		WT_ASSERT(session, fh == NULL);
		WT_ERR(__wt_log_acquire(session, logrec->len, &tmp));
	}
	WT_ERR(__log_fill(session, &myslot, true, buf, NULL));
	/*
	 * Make sure the header gets to disk.
	 */
	WT_ERR(__wt_fsync(session, tmp.slot_fh));
	if (end_lsn != NULL)
		*end_lsn = tmp.slot_end_lsn;

err:	__wt_scr_free(session, &buf);
	return (ret);
}

/*
 * __log_openfile --
 *	Open a log file with the given log file number and return the WT_FH.
 */
static int
__log_openfile(WT_SESSION_IMPL *session,
    bool ok_create, WT_FH **fh, const char *file_prefix, uint32_t id)
{
	WT_DECL_ITEM(buf);
	WT_DECL_RET;
	WT_LOG *log;
	WT_LOG_DESC *desc;
	WT_LOG_RECORD *logrec;
	uint32_t allocsize;

	log = S2C(session)->log;
	if (log == NULL)
		allocsize = WT_LOG_ALIGN;
	else
		allocsize = log->allocsize;
	WT_RET(__wt_scr_alloc(session, 0, &buf));
	WT_ERR(__log_filename(session, id, file_prefix, buf));
	WT_ERR(__wt_verbose(session, WT_VERB_LOG,
	    "opening log %s", (const char *)buf->data));
	WT_ERR(__wt_open(
	    session, buf->data, ok_create, false, WT_FILE_TYPE_LOG, fh));
	/*
	 * If we are not creating the log file but opening it for reading,
	 * check that the magic number and versions are correct.
	 */
	if (!ok_create) {
		WT_ERR(__wt_buf_grow(session, buf, allocsize));
		memset(buf->mem, 0, allocsize);
		WT_ERR(__wt_read(session, *fh, 0, allocsize, buf->mem));
		logrec = (WT_LOG_RECORD *)buf->mem;
		desc = (WT_LOG_DESC *)logrec->record;
		if (desc->log_magic != WT_LOG_MAGIC)
			WT_PANIC_RET(session, WT_ERROR,
			   "log file %s corrupted: Bad magic number %" PRIu32,
			   (*fh)->name, desc->log_magic);
		if (desc->majorv > WT_LOG_MAJOR_VERSION ||
		    (desc->majorv == WT_LOG_MAJOR_VERSION &&
		    desc->minorv > WT_LOG_MINOR_VERSION))
			WT_ERR_MSG(session, WT_ERROR,
			    "unsupported WiredTiger file version: this build "
			    " only supports major/minor versions up to %d/%d, "
			    " and the file is version %d/%d",
			    WT_LOG_MAJOR_VERSION, WT_LOG_MINOR_VERSION,
			    desc->majorv, desc->minorv);
	}
err:	__wt_scr_free(session, &buf);
	return (ret);
}

/*
 * __log_alloc_prealloc --
 *	Look for a pre-allocated log file and rename it to use as the next
 *	real log file.  Called locked.
 */
static int
__log_alloc_prealloc(WT_SESSION_IMPL *session, uint32_t to_num)
{
	WT_DECL_ITEM(from_path);
	WT_DECL_ITEM(to_path);
	WT_DECL_RET;
	uint32_t from_num;
	u_int logcount;
	char **logfiles;

	/*
	 * If there are no pre-allocated files, return WT_NOTFOUND.
	 */
	logfiles = NULL;
	WT_ERR(__log_get_files(session,
	    WT_LOG_PREPNAME, &logfiles, &logcount));
	if (logcount == 0)
		return (WT_NOTFOUND);

	/*
	 * We have a file to use.  Just use the first one.
	 */
	WT_ERR(__wt_log_extract_lognum(session, logfiles[0], &from_num));

	WT_ERR(__wt_scr_alloc(session, 0, &from_path));
	WT_ERR(__wt_scr_alloc(session, 0, &to_path));
	WT_ERR(__log_filename(session, from_num, WT_LOG_PREPNAME, from_path));
	WT_ERR(__log_filename(session, to_num, WT_LOG_FILENAME, to_path));
	WT_ERR(__wt_verbose(session, WT_VERB_LOG,
	    "log_alloc_prealloc: rename log %s to %s",
	    (char *)from_path->data, (char *)to_path->data));
	WT_STAT_FAST_CONN_INCR(session, log_prealloc_used);
	/*
	 * All file setup, writing the header and pre-allocation was done
	 * before.  We only need to rename it.
	 */
	WT_ERR(__wt_rename(session, from_path->data, to_path->data));

err:	__wt_scr_free(session, &from_path);
	__wt_scr_free(session, &to_path);
	if (logfiles != NULL)
		__wt_log_files_free(session, logfiles, logcount);
	return (ret);
}

/*
 * __log_newfile --
 *	Create the next log file and write the file header record into it.
 */
static int
__log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;
	WT_LSN end_lsn;
	int yield_cnt;
	bool create_log;

	conn = S2C(session);
	log = conn->log;

	create_log = true;
	yield_cnt = 0;
	/*
	 * Set aside the log file handle to be closed later.  Other threads
	 * may still be using it to write to the log.  If the log file size
	 * is small we could fill a log file before the previous one is closed.
	 * Wait for that to close.
	 */
	WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
	while (log->log_close_fh != NULL) {
		WT_STAT_FAST_CONN_INCR(session, log_close_yields);
		WT_RET(__wt_log_wrlsn(session, NULL));
		if (++yield_cnt > 10000)
			return (EBUSY);
		__wt_yield();
	}
	/*
	 * Note, the file server worker thread has code that knows that
	 * the file handle is set before the LSN.  Do not reorder without
	 * also reviewing that code.
	 */
	log->log_close_fh = log->log_fh;
	if (log->log_close_fh != NULL)
		log->log_close_lsn = log->alloc_lsn;
	log->fileid++;
	/*
	 * Make sure everything we set above is visible.
	 */
	WT_FULL_BARRIER();
	/*
	 * If we're pre-allocating log files, look for one.  If there aren't any
	 * or we're not pre-allocating, or a backup cursor is open, then
	 * create one.
	 */
	if (conn->log_prealloc > 0 && !conn->hot_backup) {
		ret = __log_alloc_prealloc(session, log->fileid);
		/*
		 * If ret is 0 it means we found a pre-allocated file.
		 * If ret is non-zero but not WT_NOTFOUND, we return the error.
		 * If ret is WT_NOTFOUND, we leave create_log set and create
		 * the new log file.
		 */
		if (ret == 0)
			create_log = false;
		/*
		 * If we get any error other than WT_NOTFOUND, return it.
		 */
		WT_RET_NOTFOUND_OK(ret);

		if (create_log) {
			WT_STAT_FAST_CONN_INCR(session, log_prealloc_missed);
			if (conn->log_cond != NULL)
				WT_RET(__wt_cond_signal(
				    session, conn->log_cond));
		}
	}
	/*
	 * If we need to create the log file, do so now.
	 */
	if (create_log) {
		log->prep_missed++;
		WT_RET(__wt_log_allocfile(
		    session, log->fileid, WT_LOG_FILENAME));
	}
	WT_RET(__log_openfile(session,
	    false, &log->log_fh, WT_LOG_FILENAME, log->fileid));
	/*
	 * We need to setup the LSNs.  Set the end LSN and alloc LSN to
	 * the end of the header.
	 */
	log->alloc_lsn.file = log->fileid;
	log->alloc_lsn.offset = WT_LOG_FIRST_RECORD;
	end_lsn = log->alloc_lsn;

	/*
	 * If we're called from connection creation code, we need to update
	 * the LSNs since we're the only write in progress.
	 */
	if (conn_open) {
		WT_RET(__wt_fsync(session, log->log_fh));
		log->sync_lsn = end_lsn;
		log->write_lsn = end_lsn;
		log->write_start_lsn = end_lsn;
	}
	if (created != NULL)
		*created = create_log;
	return (0);
}

/*
 * __wt_log_acquire --
 *	Called serially when switching slots.  Can be called recursively
 *	from __log_newfile when we change log files.
 */
int
__wt_log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot)
{
	WT_CONNECTION_IMPL *conn;
	WT_LOG *log;
	bool created_log;

	conn = S2C(session);
	log = conn->log;
	created_log = true;
	/*
	 * Add recsize to alloc_lsn.  Save our starting LSN
	 * where the previous allocation finished for the release LSN.
	 * That way when log files switch, we're waiting for the correct LSN
	 * from outstanding writes.
	 */
	WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
	/*
	 * We need to set the release LSN earlier, before a log file change.
	 */
	slot->slot_release_lsn = log->alloc_lsn;
	/*
	 * Make sure that the size can fit in the file.  Proactively switch
	 * if it cannot.  This reduces, but does not eliminate, log files
	 * that exceed the maximum file size.  We want to minimize the risk
	 * of an error due to no space.
	 */
	if (!__log_size_fit(session, &log->alloc_lsn, recsize)) {
		WT_RET(__log_newfile(session, false, &created_log));
		if (log->log_close_fh != NULL)
			F_SET(slot, WT_SLOT_CLOSEFH);
	}

	/*
	 * Pre-allocate on the first real write into the log file, if it
	 * was just created (i.e. not pre-allocated).
	 */
	if (log->alloc_lsn.offset == WT_LOG_FIRST_RECORD && created_log)
		WT_RET(__log_prealloc(session, log->log_fh));
	/*
	 * Initialize the slot for activation.
	 */
	__wt_log_slot_activate(session, slot);

	return (0);
}

/*
 * __log_truncate --
 *	Truncate the log to the given LSN.  If this_log is set, it will only
 *	truncate the log file indicated in the given LSN.  If not set,
 *	it will truncate between the given LSN and the trunc_lsn.  That is,
 *	since we pre-allocate log files, it will free that space and allow the
 *	log to be traversed.  We use the trunc_lsn because logging has already
 *	opened the new/next log file before recovery ran.  This function assumes
 *	we are in recovery or other dedicated time and not during live running.
 */
static int
__log_truncate(WT_SESSION_IMPL *session,
    WT_LSN *lsn, const char *file_prefix, uint32_t this_log)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_FH *log_fh;
	WT_LOG *log;
	uint32_t lognum;
	u_int i, logcount;
	char **logfiles;

	conn = S2C(session);
	log = conn->log;
	log_fh = NULL;
	logcount = 0;
	logfiles = NULL;

	/*
	 * Truncate the log file to the given LSN.
	 */
	WT_ERR(__log_openfile(session, false, &log_fh, file_prefix, lsn->file));
	WT_ERR(__wt_ftruncate(session, log_fh, lsn->offset));
	WT_ERR(__wt_fsync(session, log_fh));
	WT_ERR(__wt_close(session, &log_fh));

	/*
	 * If we just want to truncate the current log, return and skip
	 * looking for intervening logs.
	 */
	if (this_log)
		goto err;
	WT_ERR(__log_get_files(session,
	    WT_LOG_FILENAME, &logfiles, &logcount));
	for (i = 0; i < logcount; i++) {
		WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
		if (lognum > lsn->file && lognum < log->trunc_lsn.file) {
			WT_ERR(__log_openfile(session,
			    false, &log_fh, file_prefix, lognum));
			/*
			 * If there are intervening files pre-allocated,
			 * truncate them to the end of the log file header.
			 */
			WT_ERR(__wt_ftruncate(session,
			    log_fh, WT_LOG_FIRST_RECORD));
			WT_ERR(__wt_fsync(session, log_fh));
			WT_ERR(__wt_close(session, &log_fh));
		}
	}
err:	WT_TRET(__wt_close(session, &log_fh));
	if (logfiles != NULL)
		__wt_log_files_free(session, logfiles, logcount);
	return (ret);
}

/*
 * __wt_log_allocfile --
 *	Given a log number, create a new log file by writing the header,
 *	pre-allocating the file and moving it to the destination name.
 */
int
__wt_log_allocfile(
    WT_SESSION_IMPL *session, uint32_t lognum, const char *dest)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_ITEM(from_path);
	WT_DECL_ITEM(to_path);
	WT_DECL_RET;
	WT_FH *log_fh;
	WT_LOG *log;
	uint32_t tmp_id;

	conn = S2C(session);
	log = conn->log;
	log_fh = NULL;

	/*
	 * Preparing a log file entails creating a temporary file:
	 * - Writing the header.
	 * - Truncating to the offset of the first record.
	 * - Pre-allocating the file if needed.
	 * - Renaming it to the desired file name.
	 */
	WT_RET(__wt_scr_alloc(session, 0, &from_path));
	WT_ERR(__wt_scr_alloc(session, 0, &to_path));
	tmp_id = __wt_atomic_add32(&log->tmp_fileid, 1);
	WT_ERR(__log_filename(session, tmp_id, WT_LOG_TMPNAME, from_path));
	WT_ERR(__log_filename(session, lognum, dest, to_path));
	/*
	 * Set up the temporary file.
	 */
	WT_ERR(__log_openfile(session, true, &log_fh, WT_LOG_TMPNAME, tmp_id));
	WT_ERR(__log_file_header(session, log_fh, NULL, true));
	WT_ERR(__wt_ftruncate(session, log_fh, WT_LOG_FIRST_RECORD));
	WT_ERR(__log_prealloc(session, log_fh));
	WT_ERR(__wt_fsync(session, log_fh));
	WT_ERR(__wt_close(session, &log_fh));
	WT_ERR(__wt_verbose(session, WT_VERB_LOG,
	    "log_prealloc: rename %s to %s",
	    (char *)from_path->data, (char *)to_path->data));
	/*
	 * Rename it into place and make it available.
	 */
	WT_ERR(__wt_rename(session, from_path->data, to_path->data));

err:	__wt_scr_free(session, &from_path);
	__wt_scr_free(session, &to_path);
	WT_TRET(__wt_close(session, &log_fh));
	return (ret);
}

/*
 * __wt_log_remove --
 *	Given a log number, remove that log file.
 */
int
__wt_log_remove(WT_SESSION_IMPL *session,
    const char *file_prefix, uint32_t lognum)
{
	WT_DECL_ITEM(path);
	WT_DECL_RET;

	WT_RET(__wt_scr_alloc(session, 0, &path));
	WT_ERR(__log_filename(session, lognum, file_prefix, path));
	WT_ERR(__wt_verbose(session, WT_VERB_LOG,
	    "log_remove: remove log %s", (char *)path->data));
	WT_ERR(__wt_remove(session, path->data));
err:	__wt_scr_free(session, &path);
	return (ret);
}

/*
 * __wt_log_open --
 *	Open the appropriate log file for the connection.  The purpose is
 *	to find the last log file that exists, open it and set our initial
 *	LSNs to the end of that file.  If none exist, call __log_newfile
 *	to create it.
 */
int
__wt_log_open(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;
	uint32_t firstlog, lastlog, lognum;
	u_int i, logcount;
	char **logfiles;

	conn = S2C(session);
	log = conn->log;
	logfiles = NULL;
	logcount = 0;
	lastlog = 0;
	firstlog = UINT32_MAX;

	/*
	 * Open up a file handle to the log directory if we haven't.
	 */
	if (log->log_dir_fh == NULL) {
		WT_RET(__wt_verbose(session, WT_VERB_LOG,
		    "log_open: open fh to directory %s", conn->log_path));
		WT_RET(__wt_open(session, conn->log_path,
		    false, false, WT_FILE_TYPE_DIRECTORY, &log->log_dir_fh));
	}
	/*
	 * Clean up any old interim pre-allocated files.
	 * We clean up these files because settings have changed upon reboot
	 * and we want those settings to take effect right away.
	 */
	WT_ERR(__log_get_files(session,
	    WT_LOG_TMPNAME, &logfiles, &logcount));
	for (i = 0; i < logcount; i++) {
		WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
		WT_ERR(__wt_log_remove(session, WT_LOG_TMPNAME, lognum));
	}
	__wt_log_files_free(session, logfiles, logcount);
	logfiles = NULL;
	logcount = 0;
	WT_ERR(__log_get_files(session,
	    WT_LOG_PREPNAME, &logfiles, &logcount));
	for (i = 0; i < logcount; i++) {
		WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
		WT_ERR(__wt_log_remove(session, WT_LOG_PREPNAME, lognum));
	}
	__wt_log_files_free(session, logfiles, logcount);
	logfiles = NULL;
	/*
	 * Now look at the log files and set our LSNs.
	 */
	WT_ERR(__log_get_files(session, WT_LOG_FILENAME, &logfiles, &logcount));
	for (i = 0; i < logcount; i++) {
		WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
		lastlog = WT_MAX(lastlog, lognum);
		firstlog = WT_MIN(firstlog, lognum);
	}
	log->fileid = lastlog;
	WT_ERR(__wt_verbose(session, WT_VERB_LOG,
	    "log_open: first log %d last log %d", firstlog, lastlog));
	if (firstlog == UINT32_MAX) {
		WT_ASSERT(session, logcount == 0);
		WT_INIT_LSN(&log->first_lsn);
	} else {
		log->first_lsn.file = firstlog;
		log->first_lsn.offset = 0;
	}

	/*
	 * Start logging at the beginning of the next log file, no matter
	 * where the previous log file ends.
	 */
	WT_WITH_SLOT_LOCK(session, log, ret,
	    ret = __log_newfile(session, true, NULL));
	WT_ERR(ret);

	/* If we found log files, save the new state. */
	if (logcount > 0) {
		log->trunc_lsn = log->alloc_lsn;
		FLD_SET(conn->log_flags, WT_CONN_LOG_EXISTED);
	}

err:	if (logfiles != NULL)
		__wt_log_files_free(session, logfiles, logcount);
	return (ret);
}

/*
 * __wt_log_close --
 *	Close the log file.
 */
int
__wt_log_close(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_LOG *log;

	conn = S2C(session);
	log = conn->log;

	if (log->log_close_fh != NULL && log->log_close_fh != log->log_fh) {
		WT_RET(__wt_verbose(session, WT_VERB_LOG,
		    "closing old log %s", log->log_close_fh->name));
		WT_RET(__wt_fsync(session, log->log_close_fh));
		WT_RET(__wt_close(session, &log->log_close_fh));
	}
	if (log->log_fh != NULL) {
		WT_RET(__wt_verbose(session, WT_VERB_LOG,
		    "closing log %s", log->log_fh->name));
		WT_RET(__wt_fsync(session, log->log_fh));
		WT_RET(__wt_close(session, &log->log_fh));
		log->log_fh = NULL;
	}
	if (log->log_dir_fh != NULL) {
		WT_RET(__wt_verbose(session, WT_VERB_LOG,
		    "closing log directory %s", log->log_dir_fh->name));
		WT_RET(__wt_directory_sync_fh(session, log->log_dir_fh));
		WT_RET(__wt_close(session, &log->log_dir_fh));
		log->log_dir_fh = NULL;
	}
	return (0);
}

/*
 * __log_has_hole --
 *	Determine if the current offset represents a hole in the log
 *	file (i.e. there is valid data somewhere after the hole), or
 *	if this is the end of this log file and the remainder of the
 *	file is zeroes.
 */
static int
__log_has_hole(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, bool *hole)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;
	wt_off_t log_size, off, remainder;
	size_t bufsz, rdlen;
	char *buf, *zerobuf;

	conn = S2C(session);
	log = conn->log;
	log_size = fh->size;
	remainder = log_size - offset;
	*hole = false;

	/*
	 * It can be very slow looking for the last real record in the log
	 * in very small chunks.  Walk a megabyte at a time.  If we find a
	 * part of the log that is not just zeroes we know this log file
	 * has a hole in it.
	 */
	buf = zerobuf = NULL;
	if (log == NULL || log->allocsize < WT_MEGABYTE)
		bufsz = WT_MEGABYTE;
	else
		bufsz = log->allocsize;

	if ((size_t)remainder < bufsz)
		bufsz = (size_t)remainder;
	WT_RET(__wt_calloc_def(session, bufsz, &buf));
	WT_ERR(__wt_calloc_def(session, bufsz, &zerobuf));

	/*
	 * Read in a chunk starting at the given offset.
	 * Compare against a known zero byte chunk.
	 */
	for (off = offset; remainder > 0;
	    remainder -= (wt_off_t)rdlen, off += (wt_off_t)rdlen) {
		rdlen = WT_MIN(bufsz, (size_t)remainder);
		WT_ERR(__wt_read(session, fh, off, rdlen, buf));
		if (memcmp(buf, zerobuf, rdlen) != 0) {
			*hole = true;
			break;
		}
	}

err:	if (buf != NULL)
		__wt_free(session, buf);
	if (zerobuf != NULL)
		__wt_free(session, zerobuf);
	return (ret);
}

/*
 * __wt_log_release --
 *	Release a log slot.
 */
int
__wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;
	WT_LSN sync_lsn;
	int64_t release_buffered, release_bytes;
	int yield_count;
	bool locked;

	conn = S2C(session);
	log = conn->log;
	locked = false;
	yield_count = 0;
	if (freep != NULL)
		*freep = 1;
	release_buffered =
	    WT_LOG_SLOT_RELEASED_BUFFERED(slot->slot_state);
	release_bytes = release_buffered + slot->slot_unbuffered;

	/*
	 * Checkpoints can be configured based on amount of log written.
	 * Add in this log record to the sum and if needed, signal the
	 * checkpoint condition.  The logging subsystem manages the
	 * accumulated field.  There is a bit of layering violation
	 * here checking the connection ckpt field and using its
	 * condition.
	 */
	if (WT_CKPT_LOGSIZE(conn)) {
		log->log_written += (wt_off_t)release_bytes;
		WT_RET(__wt_checkpoint_signal(session, log->log_written));
	}

	/* Write the buffered records */
	if (release_buffered != 0)
		WT_ERR(__wt_write(session,
		    slot->slot_fh, slot->slot_start_offset,
		    (size_t)release_buffered, slot->slot_buf.mem));

	/*
	 * If we have to wait for a synchronous operation, we do not pass
	 * handling of this slot off to the worker thread.  The caller is
	 * responsible for freeing the slot in that case.  Otherwise the
	 * worker thread will free it.
	 */
	if (!F_ISSET(slot, WT_SLOT_FLUSH | WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) {
		if (freep != NULL)
			*freep = 0;
		slot->slot_state = WT_LOG_SLOT_WRITTEN;
		/*
		 * After this point the worker thread owns the slot.  There
		 * is nothing more to do but return.
		 */
		/*
		 * !!! Signalling the wrlsn_cond condition here results in
		 * worse performance because it causes more scheduling churn
		 * and more walking of the slot pool for a very small number
		 * of slots to process.  Don't signal here.
		 */
		return (0);
	}

	/*
	 * Wait for earlier groups to finish, otherwise there could
	 * be holes in the log file.
	 */
	WT_STAT_FAST_CONN_INCR(session, log_release_write_lsn);
	while (__wt_log_cmp(&log->write_lsn, &slot->slot_release_lsn) != 0) {
		/*
		 * If we're on a locked path and the write LSN is not advancing,
		 * unlock in case an earlier thread is trying to switch its
		 * slot and complete its operation.
		 */
		if (F_ISSET(session, WT_SESSION_LOCKED_SLOT))
			__wt_spin_unlock(session, &log->log_slot_lock);
		WT_ERR(__wt_cond_signal(session, conn->log_wrlsn_cond));
		if (++yield_count < WT_THOUSAND)
			__wt_yield();
		else
			ret = __wt_cond_wait(session, log->log_write_cond, 200);
		if (F_ISSET(session, WT_SESSION_LOCKED_SLOT))
			__wt_spin_lock(session, &log->log_slot_lock);
		WT_ERR(ret);
	}

	log->write_start_lsn = slot->slot_start_lsn;
	log->write_lsn = slot->slot_end_lsn;

	WT_ASSERT(session, slot != log->active_slot);
	WT_ERR(__wt_cond_signal(session, log->log_write_cond));
	F_CLR(slot, WT_SLOT_FLUSH);

	/*
	 * Signal the close thread if needed.
	 */
	if (F_ISSET(slot, WT_SLOT_CLOSEFH))
		WT_ERR(__wt_cond_signal(session, conn->log_file_cond));

	/*
	 * Try to consolidate calls to fsync to wait less.  Acquire a spin lock
	 * so that threads finishing writing to the log will wait while the
	 * current fsync completes and advance log->sync_lsn.
	 */
	while (F_ISSET(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) {
		/*
		 * We have to wait until earlier log files have finished their
		 * sync operations.  The most recent one will set the LSN to the
		 * beginning of our file.
		 */
		if (log->sync_lsn.file < slot->slot_end_lsn.file ||
		    __wt_spin_trylock(session, &log->log_sync_lock) != 0) {
			WT_ERR(__wt_cond_wait(
			    session, log->log_sync_cond, 10000));
			continue;
		}
		locked = true;

		/*
		 * Record the current end of our update after the lock.
		 * That is how far our calls can guarantee.
		 */
		sync_lsn = slot->slot_end_lsn;
		/*
		 * Check if we have to sync the parent directory.  Some
		 * combinations of sync flags may result in the log file
		 * not yet stable in its parent directory.  Do that
		 * now if needed.
		 */
		if (F_ISSET(slot, WT_SLOT_SYNC_DIR) &&
		    (log->sync_dir_lsn.file < sync_lsn.file)) {
			WT_ASSERT(session, log->log_dir_fh != NULL);
			WT_ERR(__wt_verbose(session, WT_VERB_LOG,
			    "log_release: sync directory %s to LSN %d/%lu",
			    log->log_dir_fh->name,
			    sync_lsn.file, sync_lsn.offset));
			WT_ERR(__wt_directory_sync_fh(
			    session, log->log_dir_fh));
			log->sync_dir_lsn = sync_lsn;
			WT_STAT_FAST_CONN_INCR(session, log_sync_dir);
		}

		/*
		 * Sync the log file if needed.
		 */
		if (F_ISSET(slot, WT_SLOT_SYNC) &&
		    __wt_log_cmp(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
			WT_ERR(__wt_verbose(session, WT_VERB_LOG,
			    "log_release: sync log %s to LSN %d/%lu",
			    log->log_fh->name, sync_lsn.file, sync_lsn.offset));
			WT_STAT_FAST_CONN_INCR(session, log_sync);
			WT_ERR(__wt_fsync(session, log->log_fh));
			log->sync_lsn = sync_lsn;
			WT_ERR(__wt_cond_signal(session, log->log_sync_cond));
		}
		/*
		 * Clear the flags before leaving the loop.
		 */
		F_CLR(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR);
		locked = false;
		__wt_spin_unlock(session, &log->log_sync_lock);
	}
err:	if (locked)
		__wt_spin_unlock(session, &log->log_sync_lock);
	if (ret != 0 && slot->slot_error == 0)
		slot->slot_error = ret;
	return (ret);
}

/*
 * __wt_log_scan --
 *	Scan the logs, calling a function on each record found.
 */
int
__wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags,
    int (*func)(WT_SESSION_IMPL *session,
    WT_ITEM *record, WT_LSN *lsnp, WT_LSN *next_lsnp,
    void *cookie, int firstrecord), void *cookie)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_ITEM(buf);
	WT_DECL_ITEM(decryptitem);
	WT_DECL_ITEM(uncitem);
	WT_DECL_RET;
	WT_FH *log_fh;
	WT_ITEM *cbbuf;
	WT_LOG *log;
	WT_LOG_RECORD *logrec;
	WT_LSN end_lsn, next_lsn, rd_lsn, start_lsn;
	wt_off_t log_size;
	uint32_t allocsize, cksum, firstlog, lastlog, lognum, rdup_len, reclen;
	u_int i, logcount;
	int firstrecord;
	bool eol;
	char **logfiles;

	conn = S2C(session);
	log = conn->log;
	log_fh = NULL;
	logcount = 0;
	logfiles = NULL;
	eol = false;
	firstrecord = 1;

	/*
	 * If the caller did not give us a callback function there is nothing
	 * to do.
	 */
	if (func == NULL)
		return (0);

	if (LF_ISSET(WT_LOGSCAN_RECOVER))
		WT_RET(__wt_verbose(session, WT_VERB_LOG,
		    "__wt_log_scan truncating to %u/%" PRIuMAX,
		    log->trunc_lsn.file, (uintmax_t)log->trunc_lsn.offset));

	if (log != NULL) {
		allocsize = log->allocsize;

		if (lsnp == NULL) {
			if (LF_ISSET(WT_LOGSCAN_FIRST))
				start_lsn = log->first_lsn;
			else if (LF_ISSET(WT_LOGSCAN_FROM_CKP))
				start_lsn = log->ckpt_lsn;
			else
				return (WT_ERROR);	/* Illegal usage */
		} else {
			if (LF_ISSET(WT_LOGSCAN_FIRST|WT_LOGSCAN_FROM_CKP))
				WT_RET_MSG(session, WT_ERROR,
			    "choose either a start LSN or a start flag");

			/* Offsets must be on allocation boundaries. */
			if (lsnp->offset % allocsize != 0 ||
			    lsnp->file > log->fileid)
				return (WT_NOTFOUND);

			/*
			 * Log cursors may not know the starting LSN.  If an
			 * LSN is passed in that it is equal to the smallest
			 * LSN, start from the beginning of the log.
			 */
			start_lsn = *lsnp;
			if (WT_IS_INIT_LSN(&start_lsn))
				start_lsn = log->first_lsn;
		}
		end_lsn = log->alloc_lsn;
	} else {
		/*
		 * If logging is not configured, we can still print out the log
		 * if log files exist.  We just need to set the LSNs from what
		 * is in the files versus what is in the live connection.
		 */
		/*
		 * Set allocsize to the minimum alignment it could be.  Larger
		 * records and larger allocation boundaries should always be
		 * a multiple of this.
		 */
		allocsize = WT_LOG_ALIGN;
		lastlog = 0;
		firstlog = UINT32_MAX;
		WT_RET(__log_get_files(session,
		    WT_LOG_FILENAME, &logfiles, &logcount));
		if (logcount == 0)
			/*
			 * Return it is not supported if none don't exist.
			 */
			return (ENOTSUP);
		for (i = 0; i < logcount; i++) {
			WT_ERR(__wt_log_extract_lognum(session, logfiles[i],
			    &lognum));
			lastlog = WT_MAX(lastlog, lognum);
			firstlog = WT_MIN(firstlog, lognum);
		}
		start_lsn.file = firstlog;
		end_lsn.file = lastlog;
		start_lsn.offset = end_lsn.offset = 0;
		__wt_log_files_free(session, logfiles, logcount);
		logfiles = NULL;
	}
	WT_ERR(__log_openfile(
	    session, false, &log_fh, WT_LOG_FILENAME, start_lsn.file));
	WT_ERR(__wt_filesize(session, log_fh, &log_size));
	rd_lsn = start_lsn;

	WT_ERR(__wt_scr_alloc(session, WT_LOG_ALIGN, &buf));
	WT_ERR(__wt_scr_alloc(session, 0, &decryptitem));
	WT_ERR(__wt_scr_alloc(session, 0, &uncitem));
	for (;;) {
		if (rd_lsn.offset + allocsize > log_size) {
advance:
			/*
			 * If we read the last record, go to the next file.
			 */
			WT_ERR(__wt_close(session, &log_fh));
			log_fh = NULL;
			eol = true;
			/*
			 * Truncate this log file before we move to the next.
			 */
			if (LF_ISSET(WT_LOGSCAN_RECOVER))
				WT_ERR(__log_truncate(session,
				    &rd_lsn, WT_LOG_FILENAME, 1));
			rd_lsn.file++;
			rd_lsn.offset = 0;
			/*
			 * Avoid an error message when we reach end of log
			 * by checking here.
			 */
			if (rd_lsn.file > end_lsn.file)
				break;
			WT_ERR(__log_openfile(session,
			    false, &log_fh, WT_LOG_FILENAME, rd_lsn.file));
			WT_ERR(__wt_filesize(session, log_fh, &log_size));
			eol = false;
			continue;
		}
		/*
		 * Read the minimum allocation size a record could be.
		 */
		WT_ASSERT(session, buf->memsize >= allocsize);
		WT_ERR(__wt_read(session,
		    log_fh, rd_lsn.offset, (size_t)allocsize, buf->mem));
		/*
		 * First 4 bytes is the real record length.  See if we
		 * need to read more than the allocation size.  We expect
		 * that we rarely will have to read more.  Most log records
		 * will be fairly small.
		 */
		reclen = *(uint32_t *)buf->mem;
		/*
		 * Log files are pre-allocated.  We need to detect the
		 * difference between a hole in the file (where this location
		 * would be considered the end of log) and the last record
		 * in the log and we're at the zeroed part of the file.
		 * If we find a zeroed record, scan forward in the log looking
		 * for any data.  If we detect any we have a hole and stop.
		 * Otherwise if the rest is all zeroes advance to the next file.
		 * When recovery finds the end of the log, truncate the file
		 * and remove any later log files that may exist.
		 */
		if (reclen == 0) {
			WT_ERR(__log_has_hole(
			    session, log_fh, rd_lsn.offset, &eol));
			if (eol)
				/* Found a hole. This LSN is the end. */
				break;
			else
				/* Last record in log.  Look for more. */
				goto advance;
		}
		rdup_len = __wt_rduppo2(reclen, allocsize);
		if (reclen > allocsize) {
			/*
			 * The log file end could be the middle of this
			 * log record.
			 */
			if (rd_lsn.offset + rdup_len > log_size)
				goto advance;
			/*
			 * We need to round up and read in the full padded
			 * record, especially for direct I/O.
			 */
			WT_ERR(__wt_buf_grow(session, buf, rdup_len));
			WT_ERR(__wt_read(session,
			    log_fh, rd_lsn.offset, (size_t)rdup_len, buf->mem));
			WT_STAT_FAST_CONN_INCR(session, log_scan_rereads);
		}
		/*
		 * We read in the record, verify checksum.
		 */
		buf->size = reclen;
		logrec = (WT_LOG_RECORD *)buf->mem;
		cksum = logrec->checksum;
		logrec->checksum = 0;
		logrec->checksum = __wt_cksum(logrec, logrec->len);
		if (logrec->checksum != cksum) {
			/*
			 * A checksum mismatch means we have reached the end of
			 * the useful part of the log.  This should be found on
			 * the first pass through recovery.  In the second pass
			 * where we truncate the log, this is where it should
			 * end.
			 */
			if (log != NULL)
				log->trunc_lsn = rd_lsn;
			/*
			 * If the user asked for a specific LSN and it is not
			 * a valid LSN, return WT_NOTFOUND.
			 */
			if (LF_ISSET(WT_LOGSCAN_ONE))
				ret = WT_NOTFOUND;
			break;
		}

		/*
		 * We have a valid log record.  If it is not the log file
		 * header, invoke the callback.
		 */
		WT_STAT_FAST_CONN_INCR(session, log_scan_records);
		next_lsn = rd_lsn;
		next_lsn.offset += (wt_off_t)rdup_len;
		if (rd_lsn.offset != 0) {
			/*
			 * We need to manage the different buffers here.
			 * Buf is the buffer this function uses to read from
			 * the disk.  The callback buffer may change based
			 * on whether encryption and compression are used.
			 *
			 * We want to free any buffers from compression and
			 * encryption but keep the one we use for reading.
			 */
			cbbuf = buf;
			if (F_ISSET(logrec, WT_LOG_RECORD_ENCRYPTED)) {
				WT_ERR(__log_decrypt(
				    session, cbbuf, decryptitem));
				cbbuf = decryptitem;
			}
			if (F_ISSET(logrec, WT_LOG_RECORD_COMPRESSED)) {
				WT_ERR(__log_decompress(
				    session, cbbuf, uncitem));
				cbbuf = uncitem;
			}
			WT_ERR((*func)(session,
			    cbbuf, &rd_lsn, &next_lsn, cookie, firstrecord));

			firstrecord = 0;

			if (LF_ISSET(WT_LOGSCAN_ONE))
				break;
		}
		rd_lsn = next_lsn;
	}

	/* Truncate if we're in recovery. */
	if (LF_ISSET(WT_LOGSCAN_RECOVER) &&
	    __wt_log_cmp(&rd_lsn, &log->trunc_lsn) < 0)
		WT_ERR(__log_truncate(session,
		    &rd_lsn, WT_LOG_FILENAME, 0));

err:	WT_STAT_FAST_CONN_INCR(session, log_scans);

	if (logfiles != NULL)
		__wt_log_files_free(session, logfiles, logcount);

	__wt_scr_free(session, &buf);
	__wt_scr_free(session, &decryptitem);
	__wt_scr_free(session, &uncitem);

	/*
	 * If the caller wants one record and it is at the end of log,
	 * return WT_NOTFOUND.
	 */
	if (LF_ISSET(WT_LOGSCAN_ONE) && eol && ret == 0)
		ret = WT_NOTFOUND;
	WT_TRET(__wt_close(session, &log_fh));
	return (ret);
}

/*
 * __wt_log_force_write --
 *	Force a switch and release and write of the current slot.
 *	Wrapper function that takes the lock.
 */
int
__wt_log_force_write(WT_SESSION_IMPL *session, bool retry)
{
	WT_LOG *log;
	WT_MYSLOT myslot;

	log = S2C(session)->log;
	memset(&myslot, 0, sizeof(myslot));
	myslot.slot = log->active_slot;
	return (__wt_log_slot_switch(session, &myslot, retry, true));
}

/*
 * __wt_log_write --
 *	Write a record into the log, compressing as necessary.
 */
int
__wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
    uint32_t flags)
{
	WT_COMPRESSOR *compressor;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_ITEM(citem);
	WT_DECL_ITEM(eitem);
	WT_DECL_RET;
	WT_ITEM *ip;
	WT_KEYED_ENCRYPTOR *kencryptor;
	WT_LOG *log;
	WT_LOG_RECORD *newlrp;
	int compression_failed;
	size_t dst_len, len, new_size, result_len, src_len;
	uint8_t *dst, *src;

	conn = S2C(session);
	log = conn->log;
	/*
	 * An error during opening the logging subsystem can result in it
	 * being enabled, but without an open log file.  In that case,
	 * just return.
	 */
	if (log->log_fh == NULL)
		return (0);
	ip = record;
	if ((compressor = conn->log_compressor) != NULL &&
	    record->size < log->allocsize) {
		WT_STAT_FAST_CONN_INCR(session, log_compress_small);
	} else if (compressor != NULL) {
		/* Skip the log header */
		src = (uint8_t *)record->mem + WT_LOG_COMPRESS_SKIP;
		src_len = record->size - WT_LOG_COMPRESS_SKIP;

		/*
		 * Compute the size needed for the destination buffer.  We only
		 * allocate enough memory for a copy of the original by default,
		 * if any compressed version is bigger than the original, we
		 * won't use it.  However, some compression engines (snappy is
		 * one example), may need more memory because they don't stop
		 * just because there's no more memory into which to compress.
		 */
		if (compressor->pre_size == NULL)
			len = src_len;
		else
			WT_ERR(compressor->pre_size(compressor,
			    &session->iface, src, src_len, &len));

		new_size = len + WT_LOG_COMPRESS_SKIP;
		WT_ERR(__wt_scr_alloc(session, new_size, &citem));

		/* Skip the header bytes of the destination data. */
		dst = (uint8_t *)citem->mem + WT_LOG_COMPRESS_SKIP;
		dst_len = len;

		compression_failed = 0;
		WT_ERR(compressor->compress(compressor, &session->iface,
		    src, src_len, dst, dst_len, &result_len,
		    &compression_failed));
		result_len += WT_LOG_COMPRESS_SKIP;

		/*
		 * If compression fails, or doesn't gain us at least one unit of
		 * allocation, fallback to the original version.  This isn't
		 * unexpected: if compression doesn't work for some chunk of
		 * data for some reason (noting likely additional format/header
		 * information which compressed output requires), it just means
		 * the uncompressed version is as good as it gets, and that's
		 * what we use.
		 */
		if (compression_failed ||
		    result_len / log->allocsize >=
		    record->size / log->allocsize)
			WT_STAT_FAST_CONN_INCR(session,
			    log_compress_write_fails);
		else {
			WT_STAT_FAST_CONN_INCR(session, log_compress_writes);
			WT_STAT_FAST_CONN_INCRV(session, log_compress_mem,
			    record->size);
			WT_STAT_FAST_CONN_INCRV(session, log_compress_len,
			    result_len);

			/*
			 * Copy in the skipped header bytes, set the final data
			 * size.
			 */
			memcpy(citem->mem, record->mem, WT_LOG_COMPRESS_SKIP);
			citem->size = result_len;
			ip = citem;
			newlrp = (WT_LOG_RECORD *)citem->mem;
			F_SET(newlrp, WT_LOG_RECORD_COMPRESSED);
			WT_ASSERT(session, result_len < UINT32_MAX &&
			    record->size < UINT32_MAX);
			newlrp->mem_len = WT_STORE_SIZE(record->size);
		}
	}
	if ((kencryptor = conn->kencryptor) != NULL) {
		/*
		 * Allocate enough space for the original record plus the
		 * encryption size constant plus the length we store.
		 */
		__wt_encrypt_size(session, kencryptor, ip->size, &new_size);
		WT_ERR(__wt_scr_alloc(session, new_size, &eitem));

		WT_ERR(__wt_encrypt(session, kencryptor,
		    WT_LOG_ENCRYPT_SKIP, ip, eitem));

		/*
		 * Final setup of new buffer.  Set the flag for
		 * encryption in the record header.
		 */
		ip = eitem;
		newlrp = (WT_LOG_RECORD *)eitem->mem;
		F_SET(newlrp, WT_LOG_RECORD_ENCRYPTED);
		WT_ASSERT(session, new_size < UINT32_MAX &&
		    ip->size < UINT32_MAX);
	}
	ret = __log_write_internal(session, ip, lsnp, flags);

err:	__wt_scr_free(session, &citem);
	__wt_scr_free(session, &eitem);
	return (ret);
}

/*
 * __log_write_internal --
 *	Write a record into the log.
 */
static int
__log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
    uint32_t flags)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;
	WT_LOG_RECORD *logrec;
	WT_LSN lsn;
	WT_MYSLOT myslot;
	int64_t release_size;
	uint32_t force, rdup_len;
	bool free_slot;

	conn = S2C(session);
	log = conn->log;
	if (record->size > UINT32_MAX)
		WT_RET_MSG(session, EFBIG,
		    "Log record size of %" WT_SIZET_FMT " exceeds the maximum "
		    "supported size of %" PRIu32,
		    record->size, UINT32_MAX);
	WT_INIT_LSN(&lsn);
	myslot.slot = NULL;
	memset(&myslot, 0, sizeof(myslot));
	/*
	 * Assume the WT_ITEM the caller passed is a WT_LOG_RECORD, which has a
	 * header at the beginning for us to fill in.
	 *
	 * If using direct_io, the caller should pass us an aligned record.
	 * But we need to make sure it is big enough and zero-filled so
	 * that we can write the full amount.  Do this whether or not
	 * direct_io is in use because it makes the reading code cleaner.
	 */
	WT_STAT_FAST_CONN_INCRV(session, log_bytes_payload, record->size);
	rdup_len = __wt_rduppo2((uint32_t)record->size, log->allocsize);
	WT_ERR(__wt_buf_grow(session, record, rdup_len));
	WT_ASSERT(session, record->data == record->mem);
	/*
	 * If the caller's record only partially fills the necessary
	 * space, we need to zero-fill the remainder.
	 */
	if (record->size != rdup_len) {
		memset((uint8_t *)record->mem + record->size, 0,
		    rdup_len - record->size);
		record->size = rdup_len;
	}
	logrec = (WT_LOG_RECORD *)record->mem;
	logrec->len = (uint32_t)record->size;
	logrec->checksum = 0;
	logrec->checksum = __wt_cksum(logrec, record->size);

	WT_STAT_FAST_CONN_INCR(session, log_writes);

	__wt_log_slot_join(session, rdup_len, flags, &myslot);
	/*
	 * If the addition of this record crosses the buffer boundary,
	 * switch in a new slot.
	 */
	force = LF_ISSET(WT_LOG_FLUSH | WT_LOG_FSYNC);
	ret = 0;
	if (myslot.end_offset >= WT_LOG_SLOT_BUF_MAX ||
	    F_ISSET(&myslot, WT_MYSLOT_UNBUFFERED) || force)
		ret = __wt_log_slot_switch(session, &myslot, true, false);
	if (ret == 0)
		ret = __log_fill(session, &myslot, false, record, &lsn);
	release_size = __wt_log_slot_release(
	    session, &myslot, (int64_t)rdup_len);
	/*
	 * If we get an error we still need to do proper accounting in
	 * the slot fields.
	 * XXX On error we may still need to call release and free.
	 */
	if (ret != 0)
		myslot.slot->slot_error = ret;
	WT_ASSERT(session, ret == 0);
	if (WT_LOG_SLOT_DONE(release_size)) {
		WT_ERR(__wt_log_release(session, myslot.slot, &free_slot));
		if (free_slot)
			__wt_log_slot_free(session, myslot.slot);
	} else if (force) {
		/*
		 * If we are going to wait for this slot to get written,
		 * signal the wrlsn thread.
		 *
		 * XXX I've seen times when conditions are NULL.
		 */
		if (conn->log_cond != NULL) {
			WT_ERR(__wt_cond_signal(session, conn->log_cond));
			__wt_yield();
		} else
			WT_ERR(__wt_log_force_write(session, 1));
	}
	if (LF_ISSET(WT_LOG_FLUSH)) {
		/* Wait for our writes to reach the OS */
		while (__wt_log_cmp(&log->write_lsn, &lsn) <= 0 &&
		    myslot.slot->slot_error == 0)
			(void)__wt_cond_wait(
			    session, log->log_write_cond, 10000);
	} else if (LF_ISSET(WT_LOG_FSYNC)) {
		/* Wait for our writes to reach disk */
		while (__wt_log_cmp(&log->sync_lsn, &lsn) <= 0 &&
		    myslot.slot->slot_error == 0)
			(void)__wt_cond_wait(
			    session, log->log_sync_cond, 10000);
	}

	/*
	 * Advance the background sync LSN if needed.
	 */
	if (LF_ISSET(WT_LOG_BACKGROUND))
		WT_ERR(__wt_log_background(session, &lsn));

err:
	if (ret == 0 && lsnp != NULL)
		*lsnp = lsn;
	/*
	 * If we're synchronous and some thread had an error, we don't know
	 * if our write made it out to the file or not.  The error could be
	 * before or after us.  So, if anyone got an error, we report it.
	 * If we're not synchronous, only report if our own operation got
	 * an error.
	 */
	if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC) && ret == 0 &&
	    myslot.slot != NULL)
		ret = myslot.slot->slot_error;

	/*
	 * If one of the sync flags is set, assert the proper LSN has moved to
	 * match.
	 */
	WT_ASSERT(session, !LF_ISSET(WT_LOG_FLUSH) ||
	    __wt_log_cmp(&log->write_lsn, &lsn) >= 0);
	WT_ASSERT(session,
	    !LF_ISSET(WT_LOG_FSYNC) || __wt_log_cmp(&log->sync_lsn, &lsn) >= 0);
	return (ret);
}

/*
 * __wt_log_vprintf --
 *	Write a message into the log.
 */
int
__wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_ITEM(logrec);
	WT_DECL_RET;
	va_list ap_copy;
	const char *rec_fmt = WT_UNCHECKED_STRING(I);
	uint32_t rectype = WT_LOGREC_MESSAGE;
	size_t header_size, len;

	conn = S2C(session);

	if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
		return (0);

	va_copy(ap_copy, ap);
	len = (size_t)vsnprintf(NULL, 0, fmt, ap_copy) + 1;
	va_end(ap_copy);

	WT_RET(
	    __wt_logrec_alloc(session, sizeof(WT_LOG_RECORD) + len, &logrec));

	/*
	 * We're writing a record with the type (an integer) followed by a
	 * string (NUL-terminated data).  To avoid writing the string into
	 * a buffer before copying it, we write the header first, then the
	 * raw bytes of the string.
	 */
	WT_ERR(__wt_struct_size(session, &header_size, rec_fmt, rectype));
	WT_ERR(__wt_struct_pack(session,
	    (uint8_t *)logrec->data + logrec->size, header_size,
	    rec_fmt, rectype));
	logrec->size += (uint32_t)header_size;

	(void)vsnprintf((char *)logrec->data + logrec->size, len, fmt, ap);

	WT_ERR(__wt_verbose(session, WT_VERB_LOG,
	    "log_printf: %s", (char *)logrec->data + logrec->size));

	logrec->size += len;
	WT_ERR(__wt_log_write(session, logrec, NULL, 0));
err:	__wt_scr_free(session, &logrec);
	return (ret);
}

/*
 * __wt_log_flush --
 *	Forcibly flush the log to the synchronization level specified.
 *	Wait until it has been completed.
 */
int
__wt_log_flush(WT_SESSION_IMPL *session, uint32_t flags)
{
	WT_CONNECTION_IMPL *conn;
	WT_LOG *log;
	WT_LSN last_lsn, lsn;

	conn = S2C(session);
	WT_ASSERT(session, FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED));
	log = conn->log;
	/*
	 * We need to flush out the current slot first to get the real
	 * end of log LSN in log->alloc_lsn.
	 */
	WT_RET(__wt_log_flush_lsn(session, &lsn, 0));
	last_lsn = log->alloc_lsn;

	/*
	 * Wait until all current outstanding writes have been written
	 * to the file system.
	 */
	while (__wt_log_cmp(&last_lsn, &lsn) > 0)
		WT_RET(__wt_log_flush_lsn(session, &lsn, false));

	WT_RET(__wt_verbose(session, WT_VERB_LOG,
	    "log_flush: flags %d LSN %d/%lu", flags, lsn.file, lsn.offset));
	/*
	 * If the user wants write-no-sync, there is nothing more to do.
	 * If the user wants background sync, set the LSN and we're done.
	 * If the user wants sync, force it now.
	 */
	if (LF_ISSET(WT_LOG_BACKGROUND))
		WT_RET(__wt_log_background(session, &lsn));
	else if (LF_ISSET(WT_LOG_FSYNC))
		WT_RET(__wt_log_force_sync(session, &lsn));
	return (0);
}