10 files changed, 2579 insertions, 2168 deletions
diff --git a/bdb/log/log.c b/bdb/log/log.c
index 8ddb7bcaf7d..f57caeccb95 100644
--- a/bdb/log/log.c
+++ b/bdb/log/log.c
@@ -1,40 +1,34 @@
 /*-
  * See the file LICENSE for redistribution information.
  *
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
  *	Sleepycat Software.  All rights reserved.
  */
 #include "db_config.h"
 
 #ifndef lint
-static const char revid[] = "$Id: log.c,v 11.42 2001/01/15 16:42:37 bostic Exp $";
+static const char revid[] = "$Id: log.c,v 11.111 2002/08/16 00:27:44 ubell Exp $";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
 #include <sys/types.h>
 
+#include <ctype.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 #endif
 
-#ifdef  HAVE_RPC
-#include "db_server.h"
-#endif
-
 #include "db_int.h"
-#include "log.h"
-#include "db_dispatch.h"
-#include "txn.h"
-#include "txn_auto.h"
-
-#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
-#endif
+#include "dbinc/crypto.h"
+#include "dbinc/hmac.h"
+#include "dbinc/log.h"
+#include "dbinc/txn.h"
 
 static int __log_init __P((DB_ENV *, DB_LOG *));
 static int __log_recover __P((DB_LOG *));
+static size_t __log_region_size __P((DB_ENV *));
+static int __log_zero __P((DB_ENV *, DB_LSN *, DB_LSN *));
 
 /*
  * __log_open --
@@ -49,16 +43,10 @@ __log_open(dbenv)
 	DB_LOG *dblp;
 	LOG *lp;
 	int ret;
-	u_int8_t *readbufp;
-
-	readbufp = NULL;
 
 	/* Create/initialize the DB_LOG structure. */
 	if ((ret = __os_calloc(dbenv, 1, sizeof(DB_LOG), &dblp)) != 0)
 		return (ret);
-	if ((ret = __os_calloc(dbenv, 1, dbenv->lg_bsize, &readbufp)) != 0)
-		goto err;
-	ZERO_LSN(dblp->c_lsn);
 	dblp->dbenv = dbenv;
 
 	/* Join/create the log region. */
@@ -69,40 +57,66 @@ __log_open(dbenv)
 	if (F_ISSET(dbenv, DB_ENV_CREATE))
 		F_SET(&dblp->reginfo, REGION_CREATE_OK);
 	if ((ret = __db_r_attach(
-	    dbenv, &dblp->reginfo, LG_BASE_REGION_SIZE + dbenv->lg_bsize)) != 0)
+	    dbenv, &dblp->reginfo, __log_region_size(dbenv))) != 0)
 		goto err;
 
-	dblp->readbufp = readbufp;
-
 	/* If we created the region, initialize it. */
-	if (F_ISSET(&dblp->reginfo, REGION_CREATE) &&
-	    (ret = __log_init(dbenv, dblp)) != 0)
-		goto err;
+	if (F_ISSET(&dblp->reginfo, REGION_CREATE))
+		if ((ret = __log_init(dbenv, dblp)) != 0)
+			goto err;
 
 	/* Set the local addresses. */
 	lp = dblp->reginfo.primary =
 	    R_ADDR(&dblp->reginfo, dblp->reginfo.rp->primary);
-	dblp->bufp = R_ADDR(&dblp->reginfo, lp->buffer_off);
 
 	/*
 	 * If the region is threaded, then we have to lock both the handles
 	 * and the region, and we need to allocate a mutex for that purpose.
 	 */
-	if (F_ISSET(dbenv, DB_ENV_THREAD)) {
-		if ((ret = __db_mutex_alloc(
-		    dbenv, &dblp->reginfo, &dblp->mutexp)) != 0)
-			goto err;
-		if ((ret = __db_mutex_init(
-		    dbenv, dblp->mutexp, 0, MUTEX_THREAD)) != 0)
+	if (F_ISSET(dbenv, DB_ENV_THREAD) &&
+	    (ret = __db_mutex_setup(dbenv, &dblp->reginfo, &dblp->mutexp,
+	    MUTEX_ALLOC | MUTEX_NO_RLOCK)) != 0)
+		goto err;
+
+	/* Initialize the rest of the structure. */
+	dblp->bufp = R_ADDR(&dblp->reginfo, lp->buffer_off);
+
+	/*
+	 * Set the handle -- we may be about to run recovery, which allocates
+	 * log cursors.  Log cursors require logging be already configured,
+	 * and the handle being set is what demonstrates that.
+	 *
+	 * If we created the region, run recovery.  If that fails, make sure
+	 * we reset the log handle before cleaning up, otherwise we will try
+	 * and clean up again in the mainline DB_ENV initialization code.
+	 */
+	dbenv->lg_handle = dblp;
+
+	if (F_ISSET(&dblp->reginfo, REGION_CREATE)) {
+		if ((ret = __log_recover(dblp)) != 0) {
+			dbenv->lg_handle = NULL;
 			goto err;
+		}
+
+		/*
+		 * We first take the log file size from the environment, if
+		 * specified.  If that wasn't set, recovery may have set it
+		 * from the persistent information in a log file header.  If
+		 * that didn't set it either, we default.
+		 */
+		if (lp->log_size == 0)
+			lp->log_size = lp->log_nsize = LG_MAX_DEFAULT;
+	} else {
+		/*
+		 * A process joining the region may have reset the log file
+		 * size, too.  If so, it only affects the next log file we
+		 * create.
+		 */
+		 if (dbenv->lg_size != 0)
+			lp->log_nsize = dbenv->lg_size;
 	}
 
 	R_UNLOCK(dbenv, &dblp->reginfo);
-
-	dblp->r_file = 0;
-	dblp->r_off = 0;
-	dblp->r_size = 0;
-	dbenv->lg_handle = dblp;
 	return (0);
 
 err:	if (dblp->reginfo.addr != NULL) {
@@ -112,11 +126,11 @@ err:	if (dblp->reginfo.addr != NULL) {
 		(void)__db_r_detach(dbenv, &dblp->reginfo, 0);
 	}
 
-	if (readbufp != NULL)
-		__os_free(readbufp, dbenv->lg_bsize);
 	if (dblp->mutexp != NULL)
 		__db_mutex_free(dbenv, &dblp->reginfo, dblp->mutexp);
-	__os_free(dblp, sizeof(*dblp));
+
+	__os_free(dbenv, dblp);
+
 	return (ret);
 }
 
@@ -129,9 +143,13 @@ __log_init(dbenv, dblp)
 	DB_ENV *dbenv;
 	DB_LOG *dblp;
 {
+	DB_MUTEX *flush_mutexp;
 	LOG *region;
 	int ret;
 	void *p;
+#ifdef  HAVE_MUTEX_SYSTEM_RESOURCES
+	u_int8_t *addr;
+#endif
 
 	if ((ret = __db_shalloc(dblp->reginfo.addr,
 	    sizeof(*region), 0, &dblp->reginfo.primary)) != 0)
@@ -141,15 +159,55 @@ __log_init(dbenv, dblp)
 	region = dblp->reginfo.primary;
 	memset(region, 0, sizeof(*region));
 
-	region->persist.lg_max = dbenv->lg_max;
-	region->persist.magic = DB_LOGMAGIC;
-	region->persist.version = DB_LOGVERSION;
-	region->persist.mode = dbenv->db_mode;
+	region->fid_max = 0;
 	SH_TAILQ_INIT(&region->fq);
+	region->free_fid_stack = INVALID_ROFF;
+	region->free_fids = region->free_fids_alloced = 0;
 
 	/* Initialize LOG LSNs. */
-	region->lsn.file = 1;
-	region->lsn.offset = 0;
+	INIT_LSN(region->lsn);
+	INIT_LSN(region->ready_lsn);
+	INIT_LSN(region->t_lsn);
+
+	/*
+	 * It's possible to be waiting for an LSN of [1][0], if a replication
+	 * client gets the first log record out of order.  An LSN of [0][0]
+	 * signifies that we're not waiting.
+	 */
+	ZERO_LSN(region->waiting_lsn);
+
+	/*
+	 * Log makes note of the fact that it ran into a checkpoint on
+	 * startup if it did so, as a recovery optimization.  A zero
+	 * LSN signifies that it hasn't found one [yet].
+	 */
+	ZERO_LSN(region->cached_ckp_lsn);
+
+#ifdef  HAVE_MUTEX_SYSTEM_RESOURCES
+	/* Allocate room for the log maintenance info and initialize it. */
+	if ((ret = __db_shalloc(dblp->reginfo.addr,
+	    sizeof(REGMAINT) + LG_MAINT_SIZE, 0, &addr)) != 0)
+		goto mem_err;
+	__db_maintinit(&dblp->reginfo, addr, LG_MAINT_SIZE);
+	region->maint_off = R_OFFSET(&dblp->reginfo, addr);
+#endif
+
+	if ((ret = __db_mutex_setup(dbenv, &dblp->reginfo, &region->fq_mutex,
+	    MUTEX_NO_RLOCK)) != 0)
+		return (ret);
+
+	/*
+	 * We must create a place for the flush mutex separately; mutexes have
+	 * to be aligned to MUTEX_ALIGN, and the only way to guarantee that is
+	 * to make sure they're at the beginning of a shalloc'ed chunk.
+	 */
+	if ((ret = __db_shalloc(dblp->reginfo.addr,
+	    sizeof(DB_MUTEX), MUTEX_ALIGN, &flush_mutexp)) != 0)
+		goto mem_err;
+	if ((ret = __db_mutex_setup(dbenv, &dblp->reginfo, flush_mutexp,
+	    MUTEX_NO_RLOCK)) != 0)
+		return (ret);
+	region->flush_mutex_off = R_OFFSET(&dblp->reginfo, flush_mutexp);
 
 	/* Initialize the buffer. */
 	if ((ret =
@@ -159,9 +217,23 @@ mem_err:	__db_err(dbenv, "Unable to allocate memory for the log buffer");
 	}
 	region->buffer_size = dbenv->lg_bsize;
 	region->buffer_off = R_OFFSET(&dblp->reginfo, p);
+	region->log_size = region->log_nsize = dbenv->lg_size;
 
-	/* Try and recover any previous log files before releasing the lock. */
-	return (__log_recover(dblp));
+	/* Initialize the commit Queue. */
+	SH_TAILQ_INIT(&region->free_commits);
+	SH_TAILQ_INIT(&region->commits);
+	region->ncommit = 0;
+
+	/*
+	 * Fill in the log's persistent header.  Don't fill in the log file
+	 * sizes, as they may change at any time and so have to be filled in
+	 * as each log file is created.
+	 */
+	region->persist.magic = DB_LOGMAGIC;
+	region->persist.version = DB_LOGVERSION;
+	region->persist.mode = (u_int32_t)dbenv->db_mode;
+
+	return (0);
 }
 
 /*
@@ -173,12 +245,16 @@ __log_recover(dblp)
 	DB_LOG *dblp;
 {
 	DBT dbt;
+	DB_ENV *dbenv;
+	DB_LOGC *logc;
 	DB_LSN lsn;
 	LOG *lp;
-	int cnt, found_checkpoint, ret;
-	u_int32_t chk;
+	u_int32_t cnt, rectype;
+	int ret;
 	logfile_validity status;
 
+	logc = NULL;
+	dbenv = dblp->dbenv;
 	lp = dblp->reginfo.primary;
 
 	/*
@@ -192,8 +268,9 @@ __log_recover(dblp)
 
 	/*
 	 * If the last file is an old version, readable or no, start a new
-	 * file.  Don't bother finding checkpoints;  if we didn't take a
-	 * checkpoint right before upgrading, the user screwed up anyway.
+	 * file.  Don't bother finding the end of the last log file;
+	 * we assume that it's valid in its entirety, since the user
+	 * should have shut down cleanly or run recovery before upgrading.
 	 */
 	if (status == DB_LV_OLD_READABLE || status == DB_LV_OLD_UNREADABLE) {
 		lp->lsn.file = lp->s_lsn.file = cnt + 1;
@@ -213,25 +290,35 @@ __log_recover(dblp)
 	lsn.file = cnt;
 	lsn.offset = 0;
 
-	/* Set the cursor.  Shouldn't fail;  leave error messages on. */
-	memset(&dbt, 0, sizeof(dbt));
-	if ((ret = __log_get(dblp, &lsn, &dbt, DB_SET, 0)) != 0)
+	/*
+	 * Allocate a cursor and set it to the first record.  This shouldn't
+	 * fail, leave error messages on.
+	 */
+	if ((ret = dbenv->log_cursor(dbenv, &logc, 0)) != 0)
 		return (ret);
+	F_SET(logc, DB_LOG_LOCKED);
+	memset(&dbt, 0, sizeof(dbt));
+	if ((ret = logc->get(logc, &lsn, &dbt, DB_SET)) != 0)
+		goto err;
 
 	/*
-	 * Read to the end of the file, saving checkpoints.  This will fail
-	 * at some point, so turn off error messages.
+	 * Read to the end of the file.  This may fail at some point, so
+	 * turn off error messages.
 	 */
-	found_checkpoint = 0;
-	while (__log_get(dblp, &lsn, &dbt, DB_NEXT, 1) == 0) {
+	F_SET(logc, DB_LOG_SILENT_ERR);
+	while (logc->get(logc, &lsn, &dbt, DB_NEXT) == 0) {
 		if (dbt.size < sizeof(u_int32_t))
 			continue;
-		memcpy(&chk, dbt.data, sizeof(u_int32_t));
-		if (chk == DB_txn_ckp) {
-			lp->chkpt_lsn = lsn;
-			found_checkpoint = 1;
-		}
+		memcpy(&rectype, dbt.data, sizeof(u_int32_t));
+		if (rectype == DB___txn_ckp)
+			/*
+			 * If we happen to run into a checkpoint, cache its
+			 * LSN so that the transaction system doesn't have
+			 * to walk this log file again looking for it.
+			 */
+			lp->cached_ckp_lsn = lsn;
 	}
+	F_CLR(logc, DB_LOG_SILENT_ERR);
 
 	/*
 	 * We now know where the end of the log is.  Set the first LSN that
@@ -240,59 +327,24 @@ __log_recover(dblp)
 	 */
 	lp->lsn = lsn;
 	lp->s_lsn = lsn;
-	lp->lsn.offset += dblp->c_len;
-	lp->s_lsn.offset += dblp->c_len;
+	lp->lsn.offset += logc->c_len;
+	lp->s_lsn.offset += logc->c_len;
 
 	/* Set up the current buffer information, too. */
-	lp->len = dblp->c_len;
+	lp->len = logc->c_len;
 	lp->b_off = 0;
 	lp->w_off = lp->lsn.offset;
 
-	/*
-	 * It's possible that we didn't find a checkpoint because there wasn't
-	 * one in the last log file.  Start searching.
-	 */
-	if (!found_checkpoint && cnt > 1) {
-		lsn.file = cnt;
-		lsn.offset = 0;
-
-		/* Set the cursor.  Shouldn't fail, leave error messages on. */
-		if ((ret = __log_get(dblp, &lsn, &dbt, DB_SET, 0)) != 0)
-			return (ret);
-
-		/*
-		 * Read to the end of the file, saving checkpoints.  Again,
-		 * this can fail if there are no checkpoints in any log file,
-		 * so turn error messages off.
-		 */
-		while (__log_get(dblp, &lsn, &dbt, DB_PREV, 1) == 0) {
-			if (dbt.size < sizeof(u_int32_t))
-				continue;
-			memcpy(&chk, dbt.data, sizeof(u_int32_t));
-			if (chk == DB_txn_ckp) {
-				lp->chkpt_lsn = lsn;
-				found_checkpoint = 1;
-				break;
-			}
-		}
-	}
-
-	/* If we never find a checkpoint, that's okay, just 0 it out. */
-	if (!found_checkpoint)
-skipsearch:	ZERO_LSN(lp->chkpt_lsn);
-
-	/*
-	 * Reset the cursor lsn to the beginning of the log, so that an
-	 * initial call to DB_NEXT does the right thing.
-	 */
-	ZERO_LSN(dblp->c_lsn);
-
-	if (FLD_ISSET(dblp->dbenv->verbose, DB_VERB_RECOVERY))
-		__db_err(dblp->dbenv,
+skipsearch:
+	if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY))
+		__db_err(dbenv,
 		    "Finding last valid log LSN: file: %lu offset %lu",
 		    (u_long)lp->lsn.file, (u_long)lp->lsn.offset);
 
-	return (0);
+err:	if (logc != NULL)
+		(void)logc->close(logc, 0);
+
+	return (ret);
 }
 
 /*
@@ -301,20 +353,23 @@ skipsearch:	ZERO_LSN(lp->chkpt_lsn);
  * the number of the first readable log file, else it will contain the number
  * of the last log file (which may be too old to read).
  *
- * PUBLIC: int __log_find __P((DB_LOG *, int, int *, logfile_validity *));
+ * PUBLIC: int __log_find __P((DB_LOG *, int, u_int32_t *, logfile_validity *));
  */
 int
 __log_find(dblp, find_first, valp, statusp)
 	DB_LOG *dblp;
-	int find_first, *valp;
+	int find_first;
+	u_int32_t *valp;
 	logfile_validity *statusp;
 {
+	DB_ENV *dbenv;
 	logfile_validity logval_status, status;
 	u_int32_t clv, logval;
 	int cnt, fcnt, ret;
 	const char *dir;
-	char **names, *p, *q, savech;
+	char *c, **names, *p, *q, savech;
 
+	dbenv = dblp->dbenv;
 	logval_status = status = DB_LV_NONEXISTENT;
 
 	/* Return a value of 0 as the log file number on failure. */
@@ -333,7 +388,7 @@ __log_find(dblp, find_first, valp, statusp)
 	}
 
 	/* Get the list of file names. */
-	ret = __os_dirlist(dblp->dbenv, dir, &names, &fcnt);
+	ret = __os_dirlist(dbenv, dir, &names, &fcnt);
 
 	/*
 	 * !!!
@@ -345,8 +400,8 @@ __log_find(dblp, find_first, valp, statusp)
 		*q = savech;
 
 	if (ret != 0) {
-		__db_err(dblp->dbenv, "%s: %s", dir, db_strerror(ret));
-		__os_freestr(p);
+		__db_err(dbenv, "%s: %s", dir, db_strerror(ret));
+		__os_free(dbenv, p);
 		return (ret);
 	}
 
@@ -356,74 +411,92 @@ __log_find(dblp, find_first, valp, statusp)
 			continue;
 
 		/*
+		 * Names of the form log\.[0-9]* are reserved for DB.  Other
+		 * names sharing LFPREFIX, such as "log.db", are legal.
+		 */
+		for (c = names[cnt] + sizeof(LFPREFIX) - 1; *c != '\0'; c++)
+			if (!isdigit((int)*c))
+				break;
+		if (*c != '\0')
+			continue;
+
+		/*
 		 * Use atol, not atoi; if an "int" is 16-bits, the largest
 		 * log file name won't fit.
 		 */
 		clv = atol(names[cnt] + (sizeof(LFPREFIX) - 1));
+
+		/*
+		 * If searching for the first log file, we want to return the
+		 * oldest log file we can read, or, if no readable log files
+		 * exist, the newest log file we can't read (the crossover
+		 * point between the old and new versions of the log file).
+		 *
+		 * If we're searching for the last log file, we want to return
+		 * the newest log file, period.
+		 *
+		 * Readable log files should never precede unreadable log
+		 * files, that would mean the admin seriously screwed up.
+		 */
 		if (find_first) {
-			if (logval != 0 && clv > logval)
+			if (logval != 0 &&
+			    status != DB_LV_OLD_UNREADABLE && clv > logval)
 				continue;
 		} else
 			if (logval != 0 && clv < logval)
 				continue;
 
-		/*
-		 * Take note of whether the log file logval is
-		 * an old version or incompletely initialized.
-		 */
-		if ((ret = __log_valid(dblp, clv, 1, &status)) != 0)
+		if ((ret = __log_valid(dblp, clv, 1, &status)) != 0) {
+			__db_err(dbenv, "Invalid log file: %s: %s",
+			    names[cnt], db_strerror(ret));
 			goto err;
+		}
 		switch (status) {
+		case DB_LV_NONEXISTENT:
+			/* __log_valid never returns DB_LV_NONEXISTENT. */
+			DB_ASSERT(0);
+			break;
 		case DB_LV_INCOMPLETE:
 			/*
-			 * It's acceptable for the last log file to
-			 * have been incompletely initialized--it's possible
-			 * to create a log file but not write anything to it,
-			 * and recovery needs to gracefully handle this.
-			 *
-			 * Just ignore it;  we don't want to return this
-			 * as a valid log file.
+			 * The last log file may not have been initialized --
+			 * it's possible to create a log file but not write
+			 * anything to it.  If performing recovery (that is,
+			 * if find_first isn't set), ignore the file, it's
+			 * not interesting.  If we're searching for the first
+			 * log record, return the file (assuming we don't find
+			 * something better), as the "real" first log record
+			 * is likely to be in the log buffer, and we want to
+			 * set the file LSN for our return.
 			 */
+			if (find_first)
+				goto found;
 			break;
-		case DB_LV_NONEXISTENT:
-			/* Should never happen. */
-			DB_ASSERT(0);
+		case DB_LV_OLD_UNREADABLE:
+			/*
+			 * If we're searching for the first log file, then we
+			 * only want this file if we don't yet have a file or
+			 * already have an unreadable file and this one is
+			 * newer than that one.  If we're searching for the
+			 * last log file, we always want this file because we
+			 * wouldn't be here if it wasn't newer than our current
+			 * choice.
+			 */
+			if (!find_first || logval == 0 ||
+			    (status == DB_LV_OLD_UNREADABLE && clv > logval))
+				goto found;
 			break;
 		case DB_LV_NORMAL:
 		case DB_LV_OLD_READABLE:
-			logval = clv;
+found:			logval = clv;
 			logval_status = status;
 			break;
-		case DB_LV_OLD_UNREADABLE:
-			/*
-			 * Continue;  we want the oldest valid log,
-			 * and clv is too old to be useful.  We don't
-			 * want it to supplant logval if we're looking for
-			 * the oldest valid log, but we do want to return
-			 * it if it's the last log file--we want the very
-			 * last file number, so that our caller can
-			 * start a new file after it.
-			 *
-			 * The code here assumes that there will never
-			 * be a too-old log that's preceded by a log
-			 * of the current version, but in order to
-			 * attain that state of affairs the user
-			 * would have had to really seriously screw
-			 * up;  I think we can safely assume this won't
-			 * happen.
-			 */
-			if (!find_first) {
-				logval = clv;
-				logval_status = status;
-			}
-			break;
 		}
 	}
 
 	*valp = logval;
 
-err:	__os_dirfree(names, fcnt);
-	__os_freestr(p);
+err:	__os_dirfree(dbenv, names, fcnt);
+	__os_free(dbenv, p);
 	*statusp = logval_status;
 
 	return (ret);
@@ -446,30 +519,48 @@ __log_valid(dblp, number, set_persist, statusp)
 	int set_persist;
 	logfile_validity *statusp;
 {
+	DB_CIPHER *db_cipher;
+	DB_ENV *dbenv;
 	DB_FH fh;
+	HDR *hdr;
 	LOG *region;
-	LOGP persist;
-	char *fname;
-	int ret;
+	LOGP *persist;
 	logfile_validity status;
-	size_t nw;
+	size_t hdrsize, nw, recsize;
+	int is_hmac, need_free, ret;
+	u_int8_t *tmp;
+	char *fname;
 
+	dbenv = dblp->dbenv;
+	db_cipher = dbenv->crypto_handle;
+	persist = NULL;
 	status = DB_LV_NORMAL;
 
 	/* Try to open the log file. */
 	if ((ret = __log_name(dblp,
 	    number, &fname, &fh, DB_OSO_RDONLY | DB_OSO_SEQ)) != 0) {
-		__os_freestr(fname);
+		__os_free(dbenv, fname);
 		return (ret);
 	}
 
+	need_free = 0;
+	hdrsize = HDR_NORMAL_SZ;
+	is_hmac = 0;
+	recsize = sizeof(LOGP);
+	if (CRYPTO_ON(dbenv)) {
+		hdrsize = HDR_CRYPTO_SZ;
+		recsize = sizeof(LOGP);
+		recsize += db_cipher->adj_size(recsize);
+		is_hmac = 1;
+	}
+	if ((ret = __os_calloc(dbenv, 1, recsize + hdrsize, &tmp)) != 0)
+		return (ret);
+	need_free = 1;
+	hdr = (HDR *)tmp;
+	persist = (LOGP *)(tmp + hdrsize);
 	/* Try to read the header. */
-	if ((ret =
-	    __os_seek(dblp->dbenv,
-	    &fh, 0, 0, sizeof(HDR), 0, DB_OS_SEEK_SET)) != 0 ||
-	    (ret =
-	    __os_read(dblp->dbenv, &fh, &persist, sizeof(LOGP), &nw)) != 0 ||
-	    nw != sizeof(LOGP)) {
+	if ((ret = __os_read(dbenv, &fh, tmp, recsize + hdrsize, &nw)) != 0 ||
+	    nw != recsize + hdrsize) {
 		if (ret == 0)
 			status = DB_LV_INCOMPLETE;
 		else
@@ -477,19 +568,63 @@ __log_valid(dblp, number, set_persist, statusp)
 			 * The error was a fatal read error, not just an
 			 * incompletely initialized log file.
 			 */
-			__db_err(dblp->dbenv, "Ignoring log file: %s: %s",
+			__db_err(dbenv, "Ignoring log file: %s: %s",
 			    fname, db_strerror(ret));
 
-		(void)__os_closehandle(&fh);
+		(void)__os_closehandle(dbenv, &fh);
 		goto err;
 	}
-	(void)__os_closehandle(&fh);
+	(void)__os_closehandle(dbenv, &fh);
+
+	/*
+	 * Now we have to validate the persistent record.  We have
+	 * several scenarios we have to deal with:
+	 *
+	 * 1.  User has crypto turned on:
+	 *	- They're reading an old, unencrypted log file
+	 *	  .  We will fail the record size match check below.
+	 *	- They're reading a current, unencrypted log file
+	 *	  .  We will fail the record size match check below.
+	 *	- They're reading an old, encrypted log file [NOT YET]
+	 *	  .  After decryption we'll fail the version check.  [NOT YET]
+	 *	- They're reading a current, encrypted log file
+	 *	  .  We should proceed as usual.
+	 * 2.  User has crypto turned off:
+	 *	- They're reading an old, unencrypted log file
+	 *	  .  We will fail the version check.
+	 *	- They're reading a current, unencrypted log file
+	 *	  .  We should proceed as usual.
+	 *	- They're reading an old, encrypted log file [NOT YET]
+	 *	  .  We'll fail the magic number check (it is encrypted).
+	 *	- They're reading a current, encrypted log file
+	 *	  .  We'll fail the magic number check (it is encrypted).
+	 */
+	if (CRYPTO_ON(dbenv)) {
+		/*
+		 * If we are trying to decrypt an unencrypted log
+		 * we can only detect that by having an unreasonable
+		 * data length for our persistent data.
+		 */
+		if ((hdr->len - hdrsize) != sizeof(LOGP)) {
+			__db_err(dbenv, "log record size mismatch");
+			goto err;
+		}
+		/* Check the checksum and decrypt. */
+		if ((ret = __db_check_chksum(dbenv, db_cipher, &hdr->chksum[0],
+		    (u_int8_t *)persist, hdr->len - hdrsize, is_hmac)) != 0) {
+			__db_err(dbenv, "log record checksum mismatch");
+			goto err;
+		}
+		if ((ret = db_cipher->decrypt(dbenv, db_cipher->data,
+		    &hdr->iv[0], (u_int8_t *)persist, hdr->len - hdrsize)) != 0)
+			goto err;
+	}
 
 	/* Validate the header. */
-	if (persist.magic != DB_LOGMAGIC) {
-		__db_err(dblp->dbenv,
+	if (persist->magic != DB_LOGMAGIC) {
+		__db_err(dbenv,
 		    "Ignoring log file: %s: magic number %lx, not %lx",
-		    fname, (u_long)persist.magic, (u_long)DB_LOGMAGIC);
+		    fname, (u_long)persist->magic, (u_long)DB_LOGMAGIC);
 		ret = EINVAL;
 		goto err;
 	}
@@ -499,135 +634,162 @@ __log_valid(dblp, number, set_persist, statusp)
 	 * belongs to an unreadable or readable old version;  leave it
 	 * alone if and only if the log file version is the current one.
 	 */
-	if (persist.version > DB_LOGVERSION) {
+	if (persist->version > DB_LOGVERSION) {
 		/* This is a fatal error--the log file is newer than DB. */
-		__db_err(dblp->dbenv,
+		__db_err(dbenv,
 		    "Ignoring log file: %s: unsupported log version %lu",
-		    fname, (u_long)persist.version);
+		    fname, (u_long)persist->version);
 		ret = EINVAL;
 		goto err;
-	} else if (persist.version < DB_LOGOLDVER) {
+	} else if (persist->version < DB_LOGOLDVER) {
 		status = DB_LV_OLD_UNREADABLE;
 		/*
 		 * We don't want to set persistent info based on an
 		 * unreadable region, so jump to "err".
 		 */
 		goto err;
-	} else if (persist.version < DB_LOGVERSION)
+	} else if (persist->version < DB_LOGVERSION)
 		status = DB_LV_OLD_READABLE;
 
 	/*
-	 * If the log is thus far readable and we're doing system
-	 * initialization, set the region's persistent information
-	 * based on the headers.
+	 * Only if we have a current log do we verify the checksum.
+	 * We could not check the checksum before checking the magic
+	 * and version because old log hdrs have the length and checksum
+	 * in a different location.
+	 */
+	if (!CRYPTO_ON(dbenv) && ((ret = __db_check_chksum(dbenv,
+	    db_cipher, &hdr->chksum[0], (u_int8_t *)persist,
+	    hdr->len - hdrsize, is_hmac)) != 0)) {
+		__db_err(dbenv, "log record checksum mismatch");
+		goto err;
+	}
+
+	/*
+	 * If the log is readable so far and we're doing system initialization,
+	 * set the region's persistent information based on the headers.
+	 *
+	 * Always set the current log file size.  Only set the next log file's
+	 * size if the application hasn't set it already.
+	 *
+	 * XXX
+	 * Always use the persistent header's mode, regardless of what was set
+	 * in the current environment.  We've always done it this way, but it's
+	 * probably a bug -- I can't think of a way not-changing the mode would
+	 * be a problem, though.
 	 */
 	if (set_persist) {
 		region = dblp->reginfo.primary;
-		region->persist.lg_max = persist.lg_max;
-		region->persist.mode = persist.mode;
+		region->log_size = persist->log_size;
+		if (region->log_nsize == 0)
+			region->log_nsize = persist->log_size;
+		region->persist.mode = persist->mode;
 	}
 
-err:	__os_freestr(fname);
+err:	__os_free(dbenv, fname);
+	if (need_free)
+		__os_free(dbenv, tmp);
 	*statusp = status;
 	return (ret);
 }
 
 /*
- * __log_close --
- *	Internal version of log_close: only called from dbenv_refresh.
+ * __log_dbenv_refresh --
+ *	Clean up after the log system on a close or failed open.  Called only
+ * from __dbenv_refresh.  (Formerly called __log_close.)
  *
- * PUBLIC: int __log_close __P((DB_ENV *));
+ * PUBLIC: int __log_dbenv_refresh __P((DB_ENV *));
  */
 int
-__log_close(dbenv)
+__log_dbenv_refresh(dbenv)
 	DB_ENV *dbenv;
 {
 	DB_LOG *dblp;
 	int ret, t_ret;
 
-	ret = 0;
 	dblp = dbenv->lg_handle;
 
 	/* We may have opened files as part of XA; if so, close them. */
 	F_SET(dblp, DBLOG_RECOVER);
-	__log_close_files(dbenv);
+	ret = __dbreg_close_files(dbenv);
 
 	/* Discard the per-thread lock. */
 	if (dblp->mutexp != NULL)
 		__db_mutex_free(dbenv, &dblp->reginfo, dblp->mutexp);
 
 	/* Detach from the region. */
-	ret = __db_r_detach(dbenv, &dblp->reginfo, 0);
+	if ((t_ret =
+	    __db_r_detach(dbenv, &dblp->reginfo, 0)) != 0 && ret == 0)
+		ret = t_ret;
 
 	/* Close open files, release allocated memory. */
 	if (F_ISSET(&dblp->lfh, DB_FH_VALID) &&
-	    (t_ret = __os_closehandle(&dblp->lfh)) != 0 && ret == 0)
-		ret = t_ret;
-	if (dblp->c_dbt.data != NULL)
-		__os_free(dblp->c_dbt.data, dblp->c_dbt.ulen);
-	if (F_ISSET(&dblp->c_fh, DB_FH_VALID) &&
-	    (t_ret = __os_closehandle(&dblp->c_fh)) != 0 && ret == 0)
+	    (t_ret = __os_closehandle(dbenv, &dblp->lfh)) != 0 && ret == 0)
 		ret = t_ret;
 	if (dblp->dbentry != NULL)
-		__os_free(dblp->dbentry,
-		    (dblp->dbentry_cnt * sizeof(DB_ENTRY)));
-	if (dblp->readbufp != NULL)
-		__os_free(dblp->readbufp, dbenv->lg_bsize);
+		__os_free(dbenv, dblp->dbentry);
 
-	__os_free(dblp, sizeof(*dblp));
+	__os_free(dbenv, dblp);
 
 	dbenv->lg_handle = NULL;
 	return (ret);
 }
 
 /*
- * log_stat --
- *	Return LOG statistics.
+ * __log_stat --
+ *	Return log statistics.
+ *
+ * PUBLIC: int __log_stat __P((DB_ENV *, DB_LOG_STAT **, u_int32_t));
  */
 int
-log_stat(dbenv, statp, db_malloc)
+__log_stat(dbenv, statp, flags)
 	DB_ENV *dbenv;
 	DB_LOG_STAT **statp;
-	void *(*db_malloc) __P((size_t));
+	u_int32_t flags;
 {
 	DB_LOG *dblp;
 	DB_LOG_STAT *stats;
 	LOG *region;
 	int ret;
 
-#ifdef HAVE_RPC
-	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
-		return (__dbcl_log_stat(dbenv, statp, db_malloc));
-#endif
-
 	PANIC_CHECK(dbenv);
-	ENV_REQUIRES_CONFIG(dbenv, dbenv->lg_handle, DB_INIT_LOG);
+	ENV_REQUIRES_CONFIG(dbenv,
+	    dbenv->lg_handle, "DB_ENV->log_stat", DB_INIT_LOG);
 
 	*statp = NULL;
+	if ((ret = __db_fchk(dbenv,
+	    "DB_ENV->log_stat", flags, DB_STAT_CLEAR)) != 0)
+		return (ret);
 
 	dblp = dbenv->lg_handle;
 	region = dblp->reginfo.primary;
 
-	if ((ret = __os_malloc(dbenv,
-	    sizeof(DB_LOG_STAT), db_malloc, &stats)) != 0)
+	if ((ret = __os_umalloc(dbenv, sizeof(DB_LOG_STAT), &stats)) != 0)
 		return (ret);
 
 	/* Copy out the global statistics. */
 	R_LOCK(dbenv, &dblp->reginfo);
 	*stats = region->stat;
+	if (LF_ISSET(DB_STAT_CLEAR))
+		memset(&region->stat, 0, sizeof(region->stat));
 
 	stats->st_magic = region->persist.magic;
 	stats->st_version = region->persist.version;
 	stats->st_mode = region->persist.mode;
 	stats->st_lg_bsize = region->buffer_size;
-	stats->st_lg_max = region->persist.lg_max;
+	stats->st_lg_size = region->log_nsize;
 
 	stats->st_region_wait = dblp->reginfo.rp->mutex.mutex_set_wait;
 	stats->st_region_nowait = dblp->reginfo.rp->mutex.mutex_set_nowait;
+	if (LF_ISSET(DB_STAT_CLEAR)) {
+		dblp->reginfo.rp->mutex.mutex_set_wait = 0;
+		dblp->reginfo.rp->mutex.mutex_set_nowait = 0;
+	}
 	stats->st_regsize = dblp->reginfo.rp->size;
 
 	stats->st_cur_file = region->lsn.file;
 	stats->st_cur_offset = region->lsn.offset;
+	stats->st_disk_file = region->s_lsn.file;
+	stats->st_disk_offset = region->s_lsn.offset;
 
 	R_UNLOCK(dbenv, &dblp->reginfo);
 
@@ -636,22 +798,287 @@ log_stat(dbenv, statp, db_malloc)
 }
 
 /*
- * __log_lastckp --
- *	Return the current chkpt_lsn, so that we can store it in
- *	the transaction region and keep the chain of checkpoints
- *	unbroken across environment recreates.
+ * __log_get_cached_ckp_lsn --
+ *	Copy out, under the region lock, any last checkpoint LSN cached on startup.
+ *
+ * PUBLIC: void __log_get_cached_ckp_lsn __P((DB_ENV *, DB_LSN *));
+ */
+void
+__log_get_cached_ckp_lsn(dbenv, ckp_lsnp)
+	DB_ENV *dbenv;
+	DB_LSN *ckp_lsnp;
+{
+	DB_LOG *logh;
+	LOG *region;
+
+	logh = (DB_LOG *)dbenv->lg_handle;
+	region = (LOG *)logh->reginfo.primary;
+
+	R_LOCK(dbenv, &logh->reginfo);
+	*ckp_lsnp = region->cached_ckp_lsn;
+	R_UNLOCK(dbenv, &logh->reginfo);
+}
+
+/*
+ * __log_region_size --
+ *	Return the amount of space needed for the log region: the
+ *	configured lg_regionmax plus the log buffer size (lg_bsize).
+ *	If HAVE_MUTEX_SYSTEM_RESOURCES is defined and DB_ENV_THREAD
+ *	is set, also leave room for a REGMAINT and LG_MAINT_SIZE
+ *	bytes for mutex system resource recording.
+ */
+static size_t
+__log_region_size(dbenv)
+	DB_ENV *dbenv;
+{
+	size_t region_size;
+
+	region_size = dbenv->lg_bsize + dbenv->lg_regionmax;
+#ifdef HAVE_MUTEX_SYSTEM_RESOURCES
+	if (F_ISSET(dbenv, DB_ENV_THREAD))
+		region_size += LG_MAINT_SIZE + sizeof(REGMAINT);
+#endif
+	return (region_size);
+}
+
+/*
+ * __log_region_destroy --
+ *	Destroy any region maintenance info (the REGMAINT at LOG->maint_off).
+ *
+ * PUBLIC: void __log_region_destroy __P((DB_ENV *, REGINFO *));
+ */
+void
+__log_region_destroy(dbenv, infop)
+	DB_ENV *dbenv;
+	REGINFO *infop;
+{
+	__db_shlocks_destroy(infop, (REGMAINT *)R_ADDR(infop,
+	    ((LOG *)R_ADDR(infop, infop->rp->primary))->maint_off));
+
+	COMPQUIET(dbenv, NULL);
+	COMPQUIET(infop, NULL);
+}
+
+/*
+ * __log_vtruncate --
+ *	This is a virtual truncate.  We set up the log indicators to
+ * make everyone believe that the record at lsn is the last one in the
+ * log; on return the region's end-of-log LSN is that of the next record
+ * to be written.  This is used in replication to discard records in the
+ * log file that do not agree with the master.
+ *
+ * PUBLIC: int __log_vtruncate __P((DB_ENV *, DB_LSN *, DB_LSN *));
+ */
+int
+__log_vtruncate(dbenv, lsn, ckplsn)
+	DB_ENV *dbenv;
+	DB_LSN *lsn, *ckplsn;
+{
+	DBT log_dbt;
+	DB_FH fh;
+	DB_LOG *dblp;
+	DB_LOGC *logc;
+	DB_LSN end_lsn;
+	LOG *lp;
+	u_int32_t bytes, c_len;
+	int fn, ret, t_ret;
+	char *fname;
+
+	/* Need to find out the length of this soon-to-be-last record. */
+	if ((ret = dbenv->log_cursor(dbenv, &logc, 0)) != 0)
+		return (ret);
+	memset(&log_dbt, 0, sizeof(log_dbt));
+	ret = logc->get(logc, lsn, &log_dbt, DB_SET);
+	c_len = logc->c_len;
+	if ((t_ret = logc->close(logc, 0)) != 0 && ret == 0)
+		ret = t_ret;
+	if (ret != 0)
+		return (ret);
+
+	/* Now do the truncate. */
+	dblp = (DB_LOG *)dbenv->lg_handle;
+	lp = (LOG *)dblp->reginfo.primary;
+
+	R_LOCK(dbenv, &dblp->reginfo);
+	end_lsn = lp->lsn;
+	lp->lsn = *lsn;
+	lp->len = c_len;
+	lp->lsn.offset += lp->len;
+
+	/*
+	 * Count the bytes written since the checkpoint at ckplsn; I am
+	 * going to assume that this doesn't exceed a 32-bit number.
+	 */
+	DB_ASSERT(lp->lsn.file >= ckplsn->file);
+	bytes = 0;
+	if (ckplsn->file != lp->lsn.file) {
+		bytes = lp->log_size - ckplsn->offset;
+		if (lp->lsn.file > ckplsn->file + 1)
+			bytes += lp->log_size *
+			    (lp->lsn.file - ckplsn->file - 1);
+		bytes += lp->lsn.offset;
+	} else
+		bytes = lp->lsn.offset - ckplsn->offset;
+
+	lp->stat.st_wc_mbytes += bytes / MEGABYTE;
+	lp->stat.st_wc_bytes += bytes % MEGABYTE;
+
+	/*
+	 * If the saved lsn is greater than our new end of log, reset it
+	 * to our current end of log.
+	 */
+	if (log_compare(&lp->s_lsn, lsn) > 0)
+		lp->s_lsn = lp->lsn;
+
+	/*
+	 * If the new end of log is in the middle of the buffer,
+	 * don't change the w_off or f_lsn.  If the new end is
+	 * before the w_off then reset w_off and f_lsn to the new
+	 * end of log.
+	 */
+	if (lp->w_off >= lp->lsn.offset) {
+		lp->f_lsn = lp->lsn;
+		lp->w_off = lp->lsn.offset;
+		lp->b_off = 0;
+	} else
+		lp->b_off = lp->lsn.offset - lp->w_off;
+
+	ZERO_LSN(lp->waiting_lsn);
+	lp->ready_lsn = lp->lsn;
+	lp->wait_recs = 0;
+	lp->rcvd_recs = 0;
+
+	/* Now throw away any extra log files that we have around. */
+	for (fn = lp->lsn.file + 1;; fn++) {
+		if (__log_name(dblp, fn, &fname, &fh, DB_OSO_RDONLY) != 0) {
+			__os_free(dbenv, fname);
+			break;
+		}
+		(void)__os_closehandle(dbenv, &fh);
+		ret = __os_unlink(dbenv, fname);
+		__os_free(dbenv, fname);
+		if (ret != 0)
+			goto err;
+	}
+
+	/* Zero the log from the new end of log to the old end of log. */
+	if ((ret = __log_zero(dbenv, &lp->lsn, &end_lsn)) != 0)
+		goto err;
+
+err:	R_UNLOCK(dbenv, &dblp->reginfo);
+	return (ret);
+}
+
+/*
+ * __log_is_outdated --
+ *	Used by the replication system to identify if a client's logs
+ * are too old.  The log represented by dbenv is compared to the file
+ * number passed in fnum.  If the log file fnum does not exist and is
+ * lower-numbered than the current logs, then we return *outdatedp
+ * non-zero, else we return it 0.
  *
- * PUBLIC: int __log_lastckp __P((DB_ENV *, DB_LSN *));
+ * PUBLIC: int __log_is_outdated __P((DB_ENV *dbenv,
+ * PUBLIC:     u_int32_t fnum, int *outdatedp));
  */
 int
-__log_lastckp(dbenv, lsnp)
+__log_is_outdated(dbenv, fnum, outdatedp)
 	DB_ENV *dbenv;
-	DB_LSN *lsnp;
+	u_int32_t fnum;
+	int *outdatedp;
 {
+	DB_LOG *dblp;
 	LOG *lp;
+	char *name;
+	int ret;
+	u_int32_t cfile;
 
-	lp = (LOG *)(((DB_LOG *)dbenv->lg_handle)->reginfo.primary);
+	dblp = dbenv->lg_handle;
+	*outdatedp = 0;
+
+	if ((ret = __log_name(dblp, fnum, &name, NULL, 0)) != 0)
+		return (ret);
+
+	/* If the file exists, we're just fine. */
+	if (__os_exists(name, NULL) == 0)
+		goto out;
+
+	/*
+	 * It didn't exist, decide if the file number is too big or
+	 * too little.  If it's too little, then we need to indicate
+	 * that the LSN is outdated.
+	 */
+	R_LOCK(dbenv, &dblp->reginfo);
+	lp = (LOG *)dblp->reginfo.primary;
+	cfile = lp->lsn.file;
+	R_UNLOCK(dbenv, &dblp->reginfo);
+
+	if (cfile > fnum)
+		*outdatedp = 1;
+out:	__os_free(dbenv, name);
+	return (ret);
+}
+
+/*
+ * __log_zero --
+ *	Zero out the tail of a log after a truncate: from from_lsn to to_lsn,
+ *	or to end of file if they are in different files.  Returns 0 or error.
+ */
+static int
+__log_zero(dbenv, from_lsn, to_lsn)
+	DB_ENV *dbenv;
+	DB_LSN *from_lsn, *to_lsn;
+{
+	char *lname;
+	DB_LOG *dblp;
+	LOG *lp;
+	int ret;
+	size_t nbytes, len, nw;
+	u_int8_t buf[4096];
+	u_int32_t mbytes, bytes;
+
+	dblp = dbenv->lg_handle;
+	lp = (LOG *)dblp->reginfo.primary;
+	lname = NULL;
+	memset(buf, 0, sizeof(buf));
+
+	if (dblp->lfname != lp->lsn.file) {
+		if (F_ISSET(&dblp->lfh, DB_FH_VALID))
+			(void)__os_closehandle(dbenv, &dblp->lfh);
+		dblp->lfname = lp->lsn.file;
+	}
+
+	if (from_lsn->file != to_lsn->file) {
+		/* We removed some log files; have to 0 to end of file. */
+		if (!F_ISSET(&dblp->lfh, DB_FH_VALID) && (ret =
+		    __log_name(dblp, dblp->lfname, &lname, &dblp->lfh, 0)) != 0)
+			goto err;
+		if ((ret = __os_ioinfo(dbenv,
+		    NULL, &dblp->lfh, &mbytes, &bytes, NULL)) != 0)
+			goto err;
+		len = mbytes * MEGABYTE + bytes - from_lsn->offset;
+	} else if (to_lsn->offset <= from_lsn->offset)
+		return (0);
+	else
+		len = to_lsn->offset - from_lsn->offset;
+
+	/* Initialize the write position. */
+	if (!F_ISSET(&dblp->lfh, DB_FH_VALID) &&
+	    (ret = __log_name(dblp, dblp->lfname, &lname, &dblp->lfh, 0)) != 0)
+		goto err;
+
+	if ((ret = __os_seek(dbenv,
+	    &dblp->lfh, 0, 0, from_lsn->offset, 0, DB_OS_SEEK_SET)) != 0)
+		goto err;
+
+	while (len > 0) {
+		nbytes = len > sizeof(buf) ? sizeof(buf) : len;
+		if ((ret =
+		    __os_write(dbenv, &dblp->lfh, buf, nbytes, &nw)) != 0)
+			goto err;
+		len -= nbytes;
+	}
+err:	if (lname != NULL)
+		__os_free(dbenv, lname);
 
-	*lsnp = lp->chkpt_lsn;
-	return (0);
+	return (ret);
 }
diff --git a/bdb/log/log.src b/bdb/log/log.src
deleted file mode 100644
index a92fae8de26..00000000000
--- a/bdb/log/log.src
+++ /dev/null
@@ -1,46 +0,0 @@
-/*-
- * See the file LICENSE for redistribution information.
- *
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
- *	Sleepycat Software.  All rights reserved.
- *
- *	$Id: log.src,v 10.12 2000/02/17 20:24:10 bostic Exp $
- */
-
-PREFIX	log
-
-INCLUDE	#include "db_config.h"
-INCLUDE
-INCLUDE #ifndef NO_SYSTEM_INCLUDES
-INCLUDE #include <sys/types.h>
-INCLUDE
-INCLUDE #include <ctype.h>
-INCLUDE #include <errno.h>
-INCLUDE #include <string.h>
-INCLUDE #endif
-INCLUDE
-INCLUDE #include "db_int.h"
-INCLUDE #include "db_page.h"
-INCLUDE #include "db_dispatch.h"
-INCLUDE #include "db_am.h"
-INCLUDE #include "log.h"
-INCLUDE #include "txn.h"
-INCLUDE
-
-/* Used for registering name/id translations at open or close. */
-DEPRECATED register1	1
-ARG	opcode		u_int32_t	lu
-DBT	name		DBT		s
-DBT	uid		DBT		s
-ARG	fileid		int32_t		ld
-ARG	ftype		DBTYPE		lx
-END
-
-BEGIN register		2
-ARG	opcode		u_int32_t	lu
-DBT	name		DBT		s
-DBT	uid		DBT		s
-ARG	fileid		int32_t		ld
-ARG	ftype		DBTYPE		lx
-ARG	meta_pgno	db_pgno_t	lu
-END
diff --git a/bdb/log/log_archive.c b/bdb/log/log_archive.c
index 83728c79e55..19e1af5a93e 100644
--- a/bdb/log/log_archive.c
+++ b/bdb/log/log_archive.c
@@ -1,14 +1,14 @@
 /*-
  * See the file LICENSE for redistribution information.
  *
- * Copyright (c) 1997, 1998, 1999, 2000
+ * Copyright (c) 1997-2002
  *	Sleepycat Software.  All rights reserved.
  */
 
 #include "db_config.h"
 
 #ifndef lint
-static const char revid[] = "$Id: log_archive.c,v 11.13 2000/11/30 00:58:40 ubell Exp $";
+static const char revid[] = "$Id: log_archive.c,v 11.39 2002/08/06 05:00:31 bostic Exp $";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -19,50 +19,41 @@ static const char revid[] = "$Id: log_archive.c,v 11.13 2000/11/30 00:58:40 ubel
 #include <unistd.h>
 #endif
 
-#ifdef  HAVE_RPC
-#include "db_server.h"
-#endif
-
 #include "db_int.h"
-#include "db_dispatch.h"
-#include "log.h"
-#include "clib_ext.h"			/* XXX: needed for getcwd. */
-
-#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
-#endif
+#include "dbinc/db_page.h"
+#include "dbinc/log.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
 
 static int __absname __P((DB_ENV *, char *, char *, char **));
-static int __build_data __P((DB_ENV *, char *, char ***, void *(*)(size_t)));
+static int __build_data __P((DB_ENV *, char *, char ***));
 static int __cmpfunc __P((const void *, const void *));
-static int __usermem __P((DB_ENV *, char ***, void *(*)(size_t)));
+static int __usermem __P((DB_ENV *, char ***));
 
 /*
- * log_archive --
+ * __log_archive --
  *	Supporting function for db_archive(1).
+ *
+ * PUBLIC: int __log_archive __P((DB_ENV *, char **[], u_int32_t));
  */
 int
-log_archive(dbenv, listp, flags, db_malloc)
+__log_archive(dbenv, listp, flags)
 	DB_ENV *dbenv;
 	char ***listp;
 	u_int32_t flags;
-	void *(*db_malloc) __P((size_t));
 {
 	DBT rec;
 	DB_LOG *dblp;
+	DB_LOGC *logc;
 	DB_LSN stable_lsn;
-	u_int32_t fnum;
-	int array_size, n, ret;
+	__txn_ckp_args *ckp_args;
 	char **array, **arrayp, *name, *p, *pref, buf[MAXPATHLEN];
-
-#ifdef HAVE_RPC
-	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
-		return (__dbcl_log_archive(dbenv, listp, flags, db_malloc));
-#endif
+	int array_size, db_arch_abs, n, ret;
+	u_int32_t fnum;
 
 	PANIC_CHECK(dbenv);
-	ENV_REQUIRES_CONFIG(dbenv, dbenv->lg_handle, DB_INIT_LOG);
+	ENV_REQUIRES_CONFIG(dbenv,
+	    dbenv->lg_handle, "DB_ENV->log_archive", DB_INIT_LOG);
 
 	name = NULL;
 	dblp = dbenv->lg_handle;
@@ -70,15 +61,24 @@ log_archive(dbenv, listp, flags, db_malloc)
 
 #define	OKFLAGS	(DB_ARCH_ABS | DB_ARCH_DATA | DB_ARCH_LOG)
 	if (flags != 0) {
-		if ((ret =
-		    __db_fchk(dbenv, "log_archive", flags, OKFLAGS)) != 0)
+		if ((ret = __db_fchk(
+		    dbenv, "DB_ENV->log_archive", flags, OKFLAGS)) != 0)
 			return (ret);
-		if ((ret =
-		    __db_fcchk(dbenv,
-			"log_archive", flags, DB_ARCH_DATA, DB_ARCH_LOG)) != 0)
+		if ((ret = __db_fcchk(dbenv, "DB_ENV->log_archive",
+		    flags, DB_ARCH_DATA, DB_ARCH_LOG)) != 0)
 			return (ret);
 	}
 
+	if (LF_ISSET(DB_ARCH_ABS)) {
+		db_arch_abs = 1;
+		LF_CLR(DB_ARCH_ABS);
+	} else
+		db_arch_abs = 0;
+
+	if (flags == 0 || flags == DB_ARCH_DATA)
+		ENV_REQUIRES_CONFIG(dbenv,
+		    dbenv->tx_handle, "DB_ENV->log_archive", DB_INIT_TXN);
+
 	/*
 	 * Get the absolute pathname of the current directory.  It would
 	 * be nice to get the shortest pathname of the database directory,
@@ -88,7 +88,7 @@ log_archive(dbenv, listp, flags, db_malloc)
 	 * Can't trust getcwd(3) to set a valid errno.  If it doesn't, just
 	 * guess that we ran out of memory.
 	 */
-	if (LF_ISSET(DB_ARCH_ABS)) {
+	if (db_arch_abs) {
 		__os_set_errno(0);
 		if ((pref = getcwd(buf, sizeof(buf))) == NULL) {
 			if (__os_get_errno() == 0)
@@ -98,31 +98,55 @@ log_archive(dbenv, listp, flags, db_malloc)
 	} else
 		pref = NULL;
 
-	switch (LF_ISSET(~DB_ARCH_ABS)) {
+	switch (flags) {
 	case DB_ARCH_DATA:
-		return (__build_data(dbenv, pref, listp, db_malloc));
+		return (__build_data(dbenv, pref, listp));
 	case DB_ARCH_LOG:
 		memset(&rec, 0, sizeof(rec));
-		if (F_ISSET(dbenv, DB_ENV_THREAD))
-			F_SET(&rec, DB_DBT_MALLOC);
-		if ((ret = log_get(dbenv, &stable_lsn, &rec, DB_LAST)) != 0)
+		if ((ret = dbenv->log_cursor(dbenv, &logc, 0)) != 0)
+			return (ret);
+#ifdef UMRW
+		ZERO_LSN(stable_lsn);
+#endif
+		ret = logc->get(logc, &stable_lsn, &rec, DB_LAST);
+		(void)logc->close(logc, 0);
+		if (ret != 0)
 			return (ret);
-		if (F_ISSET(dbenv, DB_ENV_THREAD))
-			__os_free(rec.data, rec.size);
 		fnum = stable_lsn.file;
 		break;
 	case 0:
-		if ((ret = __log_findckp(dbenv, &stable_lsn)) != 0) {
+		memset(&rec, 0, sizeof(rec));
+		if (__txn_getckp(dbenv, &stable_lsn) != 0) {
 			/*
-			 * A return of DB_NOTFOUND means that we didn't find
-			 * any records in the log (so we are not going to be
-			 * deleting any log files).
+			 * A failure return means that there's no checkpoint
+			 * in the log (so we are not going to be deleting
+			 * any log files).
 			 */
-			if (ret != DB_NOTFOUND)
-				return (ret);
 			*listp = NULL;
 			return (0);
 		}
+		if ((ret = dbenv->log_cursor(dbenv, &logc, 0)) != 0)
+			return (ret);
+		if ((ret = logc->get(logc, &stable_lsn, &rec, DB_SET)) != 0 ||
+		    (ret = __txn_ckp_read(dbenv, rec.data, &ckp_args)) != 0) {
+			/*
+			 * A return of DB_NOTFOUND may only mean that the
+			 * checkpoint LSN is before the beginning of the
+			 * log files that we still have.  This is not
+			 * an error;  it just means our work is done.
+			 */
+			if (ret == DB_NOTFOUND) {
+				*listp = NULL;
+				ret = 0;
+			}
+			(void)logc->close(logc, 0);
+			return (ret);
+		}
+		if ((ret = logc->close(logc, 0)) != 0)
+			return (ret);
+		stable_lsn = ckp_args->ckp_lsn;
+		__os_free(dbenv, ckp_args);
+
 		/* Remove any log files before the last stable LSN. */
 		fnum = stable_lsn.file - 1;
 		break;
@@ -130,9 +154,9 @@ log_archive(dbenv, listp, flags, db_malloc)
 
 #define	LIST_INCREMENT	64
 	/* Get some initial space. */
-	array_size = 10;
+	array_size = 64;
 	if ((ret = __os_malloc(dbenv,
-	    sizeof(char *) * array_size, NULL, &array)) != 0)
+	    sizeof(char *) * array_size, &array)) != 0)
 		return (ret);
 	array[0] = NULL;
 
@@ -143,27 +167,27 @@ log_archive(dbenv, listp, flags, db_malloc)
 		if (__os_exists(name, NULL) != 0) {
 			if (LF_ISSET(DB_ARCH_LOG) && fnum == stable_lsn.file)
 				continue;
-			__os_freestr(name);
+			__os_free(dbenv, name);
 			name = NULL;
 			break;
 		}
 
-		if (n >= array_size - 1) {
+		if (n >= array_size - 2) {
 			array_size += LIST_INCREMENT;
 			if ((ret = __os_realloc(dbenv,
-			    sizeof(char *) * array_size, NULL, &array)) != 0)
+			    sizeof(char *) * array_size, &array)) != 0)
 				goto err;
 		}
 
-		if (LF_ISSET(DB_ARCH_ABS)) {
+		if (db_arch_abs) {
 			if ((ret = __absname(dbenv,
 			    pref, name, &array[n])) != 0)
 				goto err;
-			__os_freestr(name);
+			__os_free(dbenv, name);
 		} else if ((p = __db_rpath(name)) != NULL) {
 			if ((ret = __os_strdup(dbenv, p + 1, &array[n])) != 0)
 				goto err;
-			__os_freestr(name);
+			__os_free(dbenv, name);
 		} else
 			array[n] = name;
 
@@ -182,7 +206,7 @@ log_archive(dbenv, listp, flags, db_malloc)
 	qsort(array, (size_t)n, sizeof(char *), __cmpfunc);
 
 	/* Rework the memory. */
-	if ((ret = __usermem(dbenv, &array, db_malloc)) != 0)
+	if ((ret = __usermem(dbenv, &array)) != 0)
 		goto err;
 
 	*listp = array;
@@ -190,11 +214,11 @@ log_archive(dbenv, listp, flags, db_malloc)
 
 err:	if (array != NULL) {
 		for (arrayp = array; *arrayp != NULL; ++arrayp)
-			__os_freestr(*arrayp);
-		__os_free(array, sizeof(char *) * array_size);
+			__os_free(dbenv, *arrayp);
+		__os_free(dbenv, array);
 	}
 	if (name != NULL)
-		__os_freestr(name);
+		__os_free(dbenv, name);
 	return (ret);
 }
 
@@ -203,73 +227,89 @@ err:	if (array != NULL) {
  *	Build a list of datafiles for return.
  */
 static int
-__build_data(dbenv, pref, listp, db_malloc)
+__build_data(dbenv, pref, listp)
 	DB_ENV *dbenv;
 	char *pref, ***listp;
-	void *(*db_malloc) __P((size_t));
 {
 	DBT rec;
+	DB_LOGC *logc;
 	DB_LSN lsn;
-	__log_register_args *argp;
+	__dbreg_register_args *argp;
 	u_int32_t rectype;
-	int array_size, last, n, nxt, ret;
-	char **array, **arrayp, *p, *real_name;
+	int array_size, last, n, nxt, ret, t_ret;
+	char **array, **arrayp, **list, **lp, *p, *real_name;
 
 	/* Get some initial space. */
-	array_size = 10;
+	array_size = 64;
 	if ((ret = __os_malloc(dbenv,
-	    sizeof(char *) * array_size, NULL, &array)) != 0)
+	    sizeof(char *) * array_size, &array)) != 0)
 		return (ret);
 	array[0] = NULL;
 
 	memset(&rec, 0, sizeof(rec));
-	if (F_ISSET(dbenv, DB_ENV_THREAD))
-		F_SET(&rec, DB_DBT_MALLOC);
-	for (n = 0, ret = log_get(dbenv, &lsn, &rec, DB_FIRST);
-	    ret == 0; ret = log_get(dbenv, &lsn, &rec, DB_NEXT)) {
+	if ((ret = dbenv->log_cursor(dbenv, &logc, 0)) != 0)
+		return (ret);
+	for (n = 0; (ret = logc->get(logc, &lsn, &rec, DB_PREV)) == 0;) {
 		if (rec.size < sizeof(rectype)) {
 			ret = EINVAL;
-			__db_err(dbenv, "log_archive: bad log record");
-			goto lg_free;
+			__db_err(dbenv, "DB_ENV->log_archive: bad log record");
+			goto free_continue;
 		}
 
 		memcpy(&rectype, rec.data, sizeof(rectype));
-		if (rectype != DB_log_register) {
-			if (F_ISSET(dbenv, DB_ENV_THREAD)) {
-				__os_free(rec.data, rec.size);
-				rec.data = NULL;
-			}
+		if (rectype != DB___dbreg_register)
 			continue;
-		}
-		if ((ret = __log_register_read(dbenv, rec.data, &argp)) != 0) {
+		if ((ret =
+		    __dbreg_register_read(dbenv, rec.data, &argp)) != 0) {
 			ret = EINVAL;
 			__db_err(dbenv,
-			    "log_archive: unable to read log record");
-			goto lg_free;
+			    "DB_ENV->log_archive: unable to read log record");
+			goto free_continue;
 		}
 
-		if (n >= array_size - 1) {
+		if (n >= array_size - 2) {
 			array_size += LIST_INCREMENT;
 			if ((ret = __os_realloc(dbenv,
-			    sizeof(char *) * array_size, NULL, &array)) != 0)
-				goto lg_free;
+			    sizeof(char *) * array_size, &array)) != 0)
+				goto free_continue;
 		}
 
 		if ((ret = __os_strdup(dbenv,
-		    argp->name.data, &array[n])) != 0) {
-lg_free:		if (F_ISSET(&rec, DB_DBT_MALLOC) && rec.data != NULL)
-				__os_free(rec.data, rec.size);
-			goto err1;
-		}
-
-		array[++n] = NULL;
-		__os_free(argp, 0);
-
-		if (F_ISSET(dbenv, DB_ENV_THREAD)) {
-			__os_free(rec.data, rec.size);
-			rec.data = NULL;
+		    argp->name.data, &array[n++])) != 0)
+			goto free_continue;
+		array[n] = NULL;
+
+		if (argp->ftype == DB_QUEUE) {
+			if ((ret = __qam_extent_names(dbenv,
+			    argp->name.data, &list)) != 0)
+				goto q_err;
+			for (lp = list;
+			    lp != NULL && *lp != NULL; lp++) {
+				if (n >= array_size - 2) {
+					array_size += LIST_INCREMENT;
+					if ((ret = __os_realloc(dbenv,
+					    sizeof(char *) *
+					    array_size, &array)) != 0)
+						goto q_err;
+				}
+				if ((ret =
+				    __os_strdup(dbenv, *lp, &array[n++])) != 0)
+					goto q_err;
+				array[n] = NULL;
+			}
+q_err:			if (list != NULL)
+				__os_free(dbenv, list);
 		}
+free_continue:	__os_free(dbenv, argp);
+		if (ret != 0)
+			break;
 	}
+	if (ret == DB_NOTFOUND)
+		ret = 0;
+	if ((t_ret = logc->close(logc, 0)) != 0 && ret == 0)
+		ret = t_ret;
+	if (ret != 0)
+		goto err1;
 
 	/* If there's nothing to return, we're done. */
 	if (n == 0) {
@@ -297,34 +337,34 @@ lg_free:		if (F_ISSET(&rec, DB_DBT_MALLOC) && rec.data != NULL)
 		}
 		for (++nxt; nxt < n &&
 		    strcmp(array[last], array[nxt]) == 0; ++nxt) {
-			__os_freestr(array[nxt]);
+			__os_free(dbenv, array[nxt]);
 			array[nxt] = NULL;
 		}
 
 		/* Get the real name. */
 		if ((ret = __db_appname(dbenv,
-		    DB_APP_DATA, NULL, array[last], 0, NULL, &real_name)) != 0)
+		    DB_APP_DATA, array[last], 0, NULL, &real_name)) != 0)
 			goto err2;
 
 		/* If the file doesn't exist, ignore it. */
 		if (__os_exists(real_name, NULL) != 0) {
-			__os_freestr(real_name);
-			__os_freestr(array[last]);
+			__os_free(dbenv, real_name);
+			__os_free(dbenv, array[last]);
 			array[last] = NULL;
 			continue;
 		}
 
 		/* Rework the name as requested by the user. */
-		__os_freestr(array[last]);
+		__os_free(dbenv, array[last]);
 		array[last] = NULL;
 		if (pref != NULL) {
 			ret = __absname(dbenv, pref, real_name, &array[last]);
-			__os_freestr(real_name);
+			__os_free(dbenv, real_name);
 			if (ret != 0)
 				goto err2;
 		} else if ((p = __db_rpath(real_name)) != NULL) {
 			ret = __os_strdup(dbenv, p + 1, &array[last]);
-			__os_freestr(real_name);
+			__os_free(dbenv, real_name);
 			if (ret != 0)
 				goto err2;
 		} else
@@ -336,7 +376,7 @@ lg_free:		if (F_ISSET(&rec, DB_DBT_MALLOC) && rec.data != NULL)
 	array[last] = NULL;
 
 	/* Rework the memory. */
-	if ((ret = __usermem(dbenv, &array, db_malloc)) != 0)
+	if ((ret = __usermem(dbenv, &array)) != 0)
 		goto err1;
 
 	*listp = array;
@@ -349,13 +389,13 @@ err2:	/*
 	 */
 	if (array != NULL)
 		for (; nxt < n; ++nxt)
-			__os_freestr(array[nxt]);
+			__os_free(dbenv, array[nxt]);
 	/* FALLTHROUGH */
 
 err1:	if (array != NULL) {
 		for (arrayp = array; *arrayp != NULL; ++arrayp)
-			__os_freestr(*arrayp);
-		__os_free(array, array_size * sizeof(char *));
+			__os_free(dbenv, *arrayp);
+		__os_free(dbenv, array);
 	}
 	return (ret);
 }
@@ -379,7 +419,7 @@ __absname(dbenv, pref, name, newnamep)
 
 	/* Malloc space for concatenating the two. */
 	if ((ret = __os_malloc(dbenv,
-	    l_pref + l_name + 2, NULL, &newname)) != 0)
+	    l_pref + l_name + 2, &newname)) != 0)
 		return (ret);
 	*newnamep = newname;
 
@@ -400,10 +440,9 @@ __absname(dbenv, pref, name, newnamep)
  *	If the user has their own malloc routine, use it.
  */
 static int
-__usermem(dbenv, listp, db_malloc)
+__usermem(dbenv, listp)
 	DB_ENV *dbenv;
 	char ***listp;
-	void *(*db_malloc) __P((size_t));
 {
 	size_t len;
 	int ret;
@@ -415,7 +454,7 @@ __usermem(dbenv, listp, db_malloc)
 	len += sizeof(char *);
 
 	/* Allocate it and set up the pointers. */
-	if ((ret = __os_malloc(dbenv, len, db_malloc, &array)) != 0)
+	if ((ret = __os_umalloc(dbenv, len, &array)) != 0)
 		return (ret);
 
 	strp = (char *)(array + (orig - *listp) + 1);
@@ -427,13 +466,13 @@ __usermem(dbenv, listp, db_malloc)
 		*arrayp = strp;
 		strp += len + 1;
 
-		__os_freestr(*orig);
+		__os_free(dbenv, *orig);
 	}
 
 	/* NULL-terminate the list. */
 	*arrayp = NULL;
 
-	__os_free(*listp, 0);
+	__os_free(dbenv, *listp);
 	*listp = array;
 
 	return (0);
diff --git a/bdb/log/log_compare.c b/bdb/log/log_compare.c
index 9bc3c028a5f..115f9c21b76 100644
--- a/bdb/log/log_compare.c
+++ b/bdb/log/log_compare.c
@@ -1,13 +1,13 @@
 /*-
  * See the file LICENSE for redistribution information.
  *
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
  *	Sleepycat Software.  All rights reserved.
  */
 #include "db_config.h"
 
 #ifndef lint
-static const char revid[] = "$Id: log_compare.c,v 11.3 2000/02/14 02:59:59 bostic Exp $";
+static const char revid[] = "$Id: log_compare.c,v 11.6 2002/01/11 15:52:50 bostic Exp $";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -19,6 +19,8 @@ static const char revid[] = "$Id: log_compare.c,v 11.3 2000/02/14 02:59:59 bosti
 /*
  * log_compare --
  *	Compare two LSN's; return 1, 0, -1 if first is >, == or < second.
+ *
+ * EXTERN: int log_compare __P((const DB_LSN *, const DB_LSN *));
  */
 int
 log_compare(lsn0, lsn1)
diff --git a/bdb/log/log_findckp.c b/bdb/log/log_findckp.c
deleted file mode 100644
index b1e8fddbdb7..00000000000
--- a/bdb/log/log_findckp.c
+++ /dev/null
@@ -1,135 +0,0 @@
-/*-
- * See the file LICENSE for redistribution information.
- *
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
- *	Sleepycat Software.  All rights reserved.
- */
-
-#include "db_config.h"
-
-#ifndef lint
-static const char revid[] = "$Id: log_findckp.c,v 11.5 2000/11/30 00:58:40 ubell Exp $";
-#endif /* not lint */
-
-#ifndef NO_SYSTEM_INCLUDES
-#include <sys/types.h>
-
-#include <string.h>
-#endif
-
-#include "db_int.h"
-#include "log.h"
-#include "txn.h"
-
-/*
- * __log_findckp --
- *
- * Looks for the most recent checkpoint that occurs before the most recent
- * checkpoint LSN, subject to the constraint that there must be at least two
- * checkpoints.  The reason you need two checkpoints is that you might have
- * crashed during the most recent one and may not have a copy of all the
- * open files.  This is the point from which recovery can start and the
- * point up to which archival/truncation can take place.  Checkpoints in
- * the log look like:
- *
- * -------------------------------------------------------------------
- *  | ckp A, ckplsn 100 |  .... record .... | ckp B, ckplsn 600 | ...
- * -------------------------------------------------------------------
- *         LSN 500                                 LSN 1000
- *
- * If we read what log returns from using the DB_CKP parameter to logput,
- * we'll get the record at LSN 1000.  The checkpoint LSN there is 600.
- * Now we have to scan backwards looking for a checkpoint before LSN 600.
- * We find one at 500.  This means that we can truncate the log before
- * 500 or run recovery beginning at 500.
- *
- * Returns 0 if we find a suitable checkpoint or we retrieved the first
- * record in the log from which to start.  Returns DB_NOTFOUND if there
- * are no log records, errno on error.
- *
- * PUBLIC: int __log_findckp __P((DB_ENV *, DB_LSN *));
- */
-int
-__log_findckp(dbenv, lsnp)
-	DB_ENV *dbenv;
-	DB_LSN *lsnp;
-{
-	DBT data;
-	DB_LSN ckp_lsn, final_ckp, last_ckp, next_lsn;
-	__txn_ckp_args *ckp_args;
-	int ret;
-
-	/*
-	 * Need to find the appropriate point from which to begin
-	 * recovery.
-	 */
-	memset(&data, 0, sizeof(data));
-	if (F_ISSET(dbenv, DB_ENV_THREAD))
-		F_SET(&data, DB_DBT_MALLOC);
-	ZERO_LSN(ckp_lsn);
-	if ((ret = log_get(dbenv, &last_ckp, &data, DB_CHECKPOINT)) != 0) {
-		if (ret == ENOENT)
-			goto get_first;
-		else
-			return (ret);
-	}
-	final_ckp = last_ckp;
-
-	next_lsn = last_ckp;
-	do {
-		if (F_ISSET(dbenv, DB_ENV_THREAD))
-			__os_free(data.data, data.size);
-
-		if ((ret = log_get(dbenv, &next_lsn, &data, DB_SET)) != 0)
-			return (ret);
-		if ((ret = __txn_ckp_read(dbenv, data.data, &ckp_args)) != 0) {
-			if (F_ISSET(dbenv, DB_ENV_THREAD))
-				__os_free(data.data, data.size);
-			return (ret);
-		}
-		if (IS_ZERO_LSN(ckp_lsn))
-			ckp_lsn = ckp_args->ckp_lsn;
-		if (FLD_ISSET(dbenv->verbose, DB_VERB_CHKPOINT)) {
-			__db_err(dbenv, "Checkpoint at: [%lu][%lu]",
-			    (u_long)last_ckp.file, (u_long)last_ckp.offset);
-			__db_err(dbenv, "Checkpoint LSN: [%lu][%lu]",
-			    (u_long)ckp_args->ckp_lsn.file,
-			    (u_long)ckp_args->ckp_lsn.offset);
-			__db_err(dbenv, "Previous checkpoint: [%lu][%lu]",
-			    (u_long)ckp_args->last_ckp.file,
-			    (u_long)ckp_args->last_ckp.offset);
-		}
-		last_ckp = next_lsn;
-		next_lsn = ckp_args->last_ckp;
-		__os_free(ckp_args, sizeof(*ckp_args));
-
-		/*
-		 * Keep looping until either you 1) run out of checkpoints,
-		 * 2) you've found a checkpoint before the most recent
-		 * checkpoint's LSN and you have at least 2 checkpoints.
-		 */
-	} while (!IS_ZERO_LSN(next_lsn) &&
-	    (log_compare(&last_ckp, &ckp_lsn) > 0 ||
-	    log_compare(&final_ckp, &last_ckp) == 0));
-
-	if (F_ISSET(dbenv, DB_ENV_THREAD))
-		__os_free(data.data, data.size);
-
-	/*
-	 * At this point, either, next_lsn is ZERO or ckp_lsn is the
-	 * checkpoint lsn and last_ckp is the LSN of the last checkpoint
-	 * before ckp_lsn.  If the compare in the loop is still true, then
-	 * next_lsn must be 0 and we need to roll forward from the
-	 * beginning of the log.
-	 */
-	if (log_compare(&last_ckp, &ckp_lsn) >= 0 ||
-	    log_compare(&final_ckp, &last_ckp) == 0) {
-get_first:	if ((ret = log_get(dbenv, &last_ckp, &data, DB_FIRST)) != 0)
-			return (ret);
-		if (F_ISSET(dbenv, DB_ENV_THREAD))
-			__os_free(data.data, data.size);
-	}
-	*lsnp = last_ckp;
-
-	return (IS_ZERO_LSN(last_ckp) ? DB_NOTFOUND : 0);
-}
diff --git a/bdb/log/log_get.c b/bdb/log/log_get.c
index b75d50a62fd..c8b028da0fb 100644
--- a/bdb/log/log_get.c
+++ b/bdb/log/log_get.c
@@ -1,13 +1,13 @@
 /*-
  * See the file LICENSE for redistribution information.
  *
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
  *	Sleepycat Software.  All rights reserved.
  */
 #include "db_config.h"
 
 #ifndef lint
-static const char revid[] = "$Id: log_get.c,v 11.32 2001/01/11 18:19:53 bostic Exp $";
+static const char revid[] = "$Id: log_get.c,v 11.81 2002/08/14 20:09:27 bostic Exp $";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -17,81 +17,175 @@ static const char revid[] = "$Id: log_get.c,v 11.32 2001/01/11 18:19:53 bostic E
 #include <unistd.h>
 #endif
 
-#ifdef  HAVE_RPC
-#include "db_server.h"
-#endif
-
 #include "db_int.h"
-#include "db_page.h"
-#include "log.h"
-#include "hash.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hmac.h"
+#include "dbinc/log.h"
+#include "dbinc/hash.h"
 
-#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
-#endif
+typedef enum { L_ALREADY, L_ACQUIRED, L_NONE } RLOCK; /* Region lock state. */
+
+static int __log_c_close __P((DB_LOGC *, u_int32_t));
+static int __log_c_get __P((DB_LOGC *, DB_LSN *, DBT *, u_int32_t));
+static int __log_c_get_int __P((DB_LOGC *, DB_LSN *, DBT *, u_int32_t));
+static int __log_c_hdrchk __P((DB_LOGC *, HDR *, int *));
+static int __log_c_incursor __P((DB_LOGC *, DB_LSN *, HDR *, u_int8_t **));
+static int __log_c_inregion __P((DB_LOGC *,
+	       DB_LSN *, RLOCK *, DB_LSN *, HDR *, u_int8_t **));
+static int __log_c_io __P((DB_LOGC *,
+	       u_int32_t, u_int32_t, void *, size_t *, int *));
+static int __log_c_ondisk __P((DB_LOGC *,
+	       DB_LSN *, DB_LSN *, int, HDR *, u_int8_t **, int *));
+static int __log_c_set_maxrec __P((DB_LOGC *, char *));
+static int __log_c_shortread __P((DB_LOGC *, int));
 
 /*
- * log_get --
- *	Get a log record.
+ * __log_cursor --
+ *	Create a log cursor; the caller releases it with DB_LOGC->close.
+ *
+ * PUBLIC: int __log_cursor __P((DB_ENV *, DB_LOGC **, u_int32_t));
  */
 int
-log_get(dbenv, alsn, dbt, flags)
+__log_cursor(dbenv, logcp, flags)
+	DB_ENV *dbenv;
+	DB_LOGC **logcp;
+	u_int32_t flags;
+{
+	DB_LOGC *logc;
+	int ret;
+
+	PANIC_CHECK(dbenv);
+	ENV_REQUIRES_CONFIG(dbenv,
+	    dbenv->lg_handle, "DB_ENV->log_cursor", DB_INIT_LOG);
+
+	*logcp = NULL;
+
+	/* Validate arguments. */
+	if ((ret = __db_fchk(dbenv, "DB_ENV->log_cursor", flags, 0)) != 0)
+		return (ret);
+
+	/* Allocate the cursor; on failure logc is unset, so skip err. */
+	if ((ret = __os_calloc(dbenv, 1, sizeof(DB_LOGC), &logc)) != 0)
+		return (ret);
+	if ((ret = __os_calloc(dbenv, 1, sizeof(DB_FH), &logc->c_fh)) != 0)
+		goto err;
+
+	logc->bp_size = DB_LOGC_BUF_SIZE;
+	if ((ret = __os_malloc(dbenv, logc->bp_size, &logc->bp)) != 0)
+		goto err;
+
+	logc->dbenv = dbenv;
+	logc->close = __log_c_close;
+	logc->get = __log_c_get;
+
+	*logcp = logc;
+	return (0);
+
+err:	if (logc != NULL) {
+		if (logc->c_fh != NULL)
+			__os_free(dbenv, logc->c_fh);
+		__os_free(dbenv, logc);
+	}
+
+	return (ret);
+}
+
+/*
+ * __log_c_close --
+ *	Close a log cursor, releasing its file handle and all of its memory.
+ */
+static int
+__log_c_close(logc, flags)
+	DB_LOGC *logc;
+	u_int32_t flags;
+{
 	DB_ENV *dbenv;
+	int ret;
+
+	dbenv = logc->dbenv;
+
+	PANIC_CHECK(dbenv);
+	if ((ret = __db_fchk(dbenv, "DB_LOGC->close", flags, 0)) != 0)
+		return (ret);		/* Nothing has been released yet. */
+
+	if (F_ISSET(logc->c_fh, DB_FH_VALID))
+		(void)__os_closehandle(dbenv, logc->c_fh);
+
+	if (logc->c_dbt.data != NULL)
+		__os_free(dbenv, logc->c_dbt.data);
+
+	__os_free(dbenv, logc->bp);
+	__os_free(dbenv, logc->c_fh);
+	__os_free(dbenv, logc);	/* logc must not be used after this. */
+
+	return (0);
+}
+
+/*
+ * __log_c_get --
+ *	Get a log record.
+ */
+static int
+__log_c_get(logc, alsn, dbt, flags)
+	DB_LOGC *logc;
 	DB_LSN *alsn;
 	DBT *dbt;
 	u_int32_t flags;
 {
-	DB_LOG *dblp;
+	DB_ENV *dbenv;
 	DB_LSN saved_lsn;
 	int ret;
 
-#ifdef HAVE_RPC
-	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
-		return (__dbcl_log_get(dbenv, alsn, dbt, flags));
-#endif
+	dbenv = logc->dbenv;
 
 	PANIC_CHECK(dbenv);
-	ENV_REQUIRES_CONFIG(dbenv, dbenv->lg_handle, DB_INIT_LOG);
 
 	/* Validate arguments. */
-	if (flags != DB_CHECKPOINT && flags != DB_CURRENT &&
-	    flags != DB_FIRST && flags != DB_LAST &&
-	    flags != DB_NEXT && flags != DB_PREV && flags != DB_SET)
-		return (__db_ferr(dbenv, "log_get", 1));
-
-	if (F_ISSET(dbenv, DB_ENV_THREAD)) {
-		if (flags == DB_NEXT || flags == DB_PREV || flags == DB_CURRENT)
-			return (__db_ferr(dbenv, "log_get", 1));
-		if (!F_ISSET(dbt,
-		    DB_DBT_MALLOC | DB_DBT_REALLOC | DB_DBT_USERMEM))
-			return (__db_ferr(dbenv, "threaded data", 1));
+	switch (flags) {
+	case DB_CURRENT:
+	case DB_FIRST:
+	case DB_LAST:
+	case DB_NEXT:
+	case DB_PREV:
+		break;
+	case DB_SET:
+		if (IS_ZERO_LSN(*alsn)) {
+			__db_err(dbenv, "DB_LOGC->get: invalid LSN");
+			return (EINVAL);
+		}
+		break;
+	default:
+		return (__db_ferr(dbenv, "DB_LOGC->get", 1));
 	}
 
-	dblp = dbenv->lg_handle;
-	R_LOCK(dbenv, &dblp->reginfo);
-
 	/*
-	 * The alsn field is only initialized if DB_SET is the flag, so this
-	 * assignment causes uninitialized memory complaints for other flag
-	 * values.
+	 * On error, we take care not to overwrite the caller's LSN.  This
+	 * is because callers looking for the end of the log loop using the
+	 * DB_NEXT flag, and expect to take the last successful lsn out of
+	 * the passed-in structure after DB_LOGC->get fails with DB_NOTFOUND.
+	 *
+	 * !!!
+	 * This line is often flagged as an uninitialized memory read during a
+	 * Purify or similar tool run, as the application didn't initialize
+	 * *alsn.  If the application isn't setting the DB_SET flag, there is
+	 * no reason it should have initialized *alsn, but we can't know that
+	 * and we want to make sure we never overwrite whatever the application
+	 * put in there.
 	 */
-#ifdef	UMRW
-	if (flags == DB_SET)
-		saved_lsn = *alsn;
-	else
-		ZERO_LSN(saved_lsn);
-#else
 	saved_lsn = *alsn;
-#endif
 
 	/*
-	 * If we get one of the log's header records, repeat the operation.
-	 * This assumes that applications don't ever request the log header
-	 * records by LSN, but that seems reasonable to me.
+	 * If we get one of the log's header records as a result of doing a
+	 * DB_FIRST, DB_NEXT, DB_LAST or DB_PREV, repeat the operation, log
+	 * file header records aren't useful to applications.
 	 */
-	if ((ret = __log_get(dblp,
-	    alsn, dbt, flags, 0)) == 0 && alsn->offset == 0) {
+	if ((ret = __log_c_get_int(logc, alsn, dbt, flags)) != 0) {
+		*alsn = saved_lsn;
+		return (ret);
+	}
+	if (alsn->offset == 0 && (flags == DB_FIRST ||
+	    flags == DB_NEXT || flags == DB_LAST || flags == DB_PREV)) {
 		switch (flags) {
 		case DB_FIRST:
 			flags = DB_NEXT;
@@ -101,92 +195,100 @@ log_get(dbenv, alsn, dbt, flags)
 			break;
 		}
 		if (F_ISSET(dbt, DB_DBT_MALLOC)) {
-			__os_free(dbt->data, dbt->size);
+			__os_free(dbenv, dbt->data);
 			dbt->data = NULL;
 		}
-		ret = __log_get(dblp, alsn, dbt, flags, 0);
+		if ((ret = __log_c_get_int(logc, alsn, dbt, flags)) != 0) {
+			*alsn = saved_lsn;
+			return (ret);
+		}
 	}
-	if (ret != 0)
-		*alsn = saved_lsn;
 
-	R_UNLOCK(dbenv, &dblp->reginfo);
-
-	return (ret);
+	return (0);
 }
 
 /*
- * __log_get --
+ * __log_c_get_int --
  *	Get a log record; internal version.
- *
- * PUBLIC: int __log_get __P((DB_LOG *, DB_LSN *, DBT *, u_int32_t, int));
  */
-int
-__log_get(dblp, alsn, dbt, flags, silent)
-	DB_LOG *dblp;
+static int
+__log_c_get_int(logc, alsn, dbt, flags)
+	DB_LOGC *logc;
 	DB_LSN *alsn;
 	DBT *dbt;
 	u_int32_t flags;
-	int silent;
 {
+	DB_CIPHER *db_cipher;
 	DB_ENV *dbenv;
-	DB_LSN nlsn;
+	DB_LOG *dblp;
+	DB_LSN last_lsn, nlsn;
 	HDR hdr;
 	LOG *lp;
-	const char *fail;
-	char *np, *tbuf;
-	int cnt, ret;
+	RLOCK rlock;
 	logfile_validity status;
-	size_t len, nr;
-	u_int32_t offset;
-	u_int8_t *p;
-	void *shortp, *readp;
+	u_int32_t cnt;
+	u_int8_t *rp;
+	int eof, is_hmac, ret;
 
+	dbenv = logc->dbenv;
+	dblp = dbenv->lg_handle;
 	lp = dblp->reginfo.primary;
-	fail = np = tbuf = NULL;
-	dbenv = dblp->dbenv;
+	is_hmac = 0;
 
-	nlsn = dblp->c_lsn;
+	/*
+	 * We don't acquire the log region lock until we need it, and we
+	 * release it as soon as we're done.
+	 */
+	rlock = F_ISSET(logc, DB_LOG_LOCKED) ? L_ALREADY : L_NONE;
+
+	nlsn = logc->c_lsn;
 	switch (flags) {
-	case DB_CHECKPOINT:
-		nlsn = lp->chkpt_lsn;
-		if (IS_ZERO_LSN(nlsn)) {
-			/* No db_err. The caller may expect this. */
-			ret = ENOENT;
-			goto err2;
-		}
-		break;
 	case DB_NEXT:				/* Next log record. */
 		if (!IS_ZERO_LSN(nlsn)) {
 			/* Increment the cursor by the cursor record size. */
-			nlsn.offset += dblp->c_len;
+			nlsn.offset += logc->c_len;
 			break;
 		}
+		flags = DB_FIRST;
 		/* FALLTHROUGH */
-	case DB_FIRST:				/* Find the first log record. */
+	case DB_FIRST:				/* First log record. */
 		/* Find the first log file. */
 		if ((ret = __log_find(dblp, 1, &cnt, &status)) != 0)
-			goto err2;
+			goto err;
 
 		/*
-		 * We want any readable version, so either DB_LV_NORMAL
-		 * or DB_LV_OLD_READABLE is acceptable here.  If it's
-		 * not one of those two, there is no first log record that
-		 * we can read.
+		 * DB_LV_INCOMPLETE:
+		 *	Theoretically, the log file we want could be created
+		 *	but not yet written, the "first" log record must be
+		 *	in the log buffer.
+		 * DB_LV_NORMAL:
+		 * DB_LV_OLD_READABLE:
+		 *	We found a log file we can read.
+		 * DB_LV_NONEXISTENT:
+		 *	No log files exist, the "first" log record must be in
+		 *	the log buffer.
+		 * DB_LV_OLD_UNREADABLE:
+		 *	No readable log files exist, we're at the cross-over
+		 *	point between two versions.  The "first" log record
+		 *	must be in the log buffer.
 		 */
-		if (status != DB_LV_NORMAL && status != DB_LV_OLD_READABLE) {
-			ret = DB_NOTFOUND;
-			goto err2;
+		switch (status) {
+		case DB_LV_INCOMPLETE:
+			DB_ASSERT(lp->lsn.file == cnt);
+			/* FALLTHROUGH */
+		case DB_LV_NORMAL:
+		case DB_LV_OLD_READABLE:
+			nlsn.file = cnt;
+			break;
+		case DB_LV_NONEXISTENT:
+			nlsn.file = 1;
+			DB_ASSERT(lp->lsn.file == nlsn.file);
+			break;
+		case DB_LV_OLD_UNREADABLE:
+			nlsn.file = cnt + 1;
+			DB_ASSERT(lp->lsn.file == nlsn.file);
+			break;
 		}
-
-		/*
-		 * We may have only entered records in the buffer, and not
-		 * yet written a log file.  If no log files were found and
-		 * there's anything in the buffer, it belongs to file 1.
-		 */
-		if (cnt == 0)
-			cnt = 1;
-
-		nlsn.file = cnt;
 		nlsn.offset = 0;
 		break;
 	case DB_CURRENT:			/* Current log record. */
@@ -197,21 +299,28 @@ __log_get(dblp, alsn, dbt, flags, silent)
 			if (nlsn.offset == 0) {
 				if (nlsn.file == 1 ||
 				    __log_valid(dblp,
-					nlsn.file - 1, 0, &status) != 0)
-					return (DB_NOTFOUND);
+					nlsn.file - 1, 0, &status) != 0) {
+					ret = DB_NOTFOUND;
+					goto err;
+				}
 
 				if (status != DB_LV_NORMAL &&
-				    status != DB_LV_OLD_READABLE)
-					return (DB_NOTFOUND);
+				    status != DB_LV_OLD_READABLE) {
+					ret = DB_NOTFOUND;
+					goto err;
+				}
 
 				--nlsn.file;
-				nlsn.offset = dblp->c_off;
-			} else
-				nlsn.offset = dblp->c_off;
+			}
+			nlsn.offset = logc->c_prev;
 			break;
 		}
 		/* FALLTHROUGH */
 	case DB_LAST:				/* Last log record. */
+		if (rlock == L_NONE) {
+			rlock = L_ACQUIRED;
+			R_LOCK(dbenv, &dblp->reginfo);
+		}
 		nlsn.file = lp->lsn.file;
 		nlsn.offset = lp->lsn.offset - lp->len;
 		break;
@@ -225,241 +334,725 @@ next_file:	++nlsn.file;
 		nlsn.offset = 0;
 	}
 
-	/* Return 1 if the request is past the end of the log. */
-	if (nlsn.file > lp->lsn.file ||
-	    (nlsn.file == lp->lsn.file && nlsn.offset >= lp->lsn.offset))
-		return (DB_NOTFOUND);
+	/*
+	 * The above switch statement should have set nlsn to the lsn of
+	 * the requested record.
+	 */
 
-	/* If we've switched files, discard the current file handle. */
-	if (dblp->c_lsn.file != nlsn.file &&
-	    F_ISSET(&dblp->c_fh, DB_FH_VALID)) {
-		(void)__os_closehandle(&dblp->c_fh);
+	if (CRYPTO_ON(dbenv)) {
+		hdr.size = HDR_CRYPTO_SZ;
+		is_hmac = 1;
+	} else {
+		hdr.size = HDR_NORMAL_SZ;
+		is_hmac = 0;
 	}
-
-	/* If the entire record is in the in-memory buffer, copy it out. */
-	if (nlsn.file == lp->lsn.file && nlsn.offset >= lp->w_off) {
-		/* Copy the header. */
-		p = dblp->bufp + (nlsn.offset - lp->w_off);
-		memcpy(&hdr, p, sizeof(HDR));
-
-		/* Copy the record. */
-		len = hdr.len - sizeof(HDR);
-		if ((ret = __db_retcopy(NULL, dbt, p + sizeof(HDR),
-		    len, &dblp->c_dbt.data, &dblp->c_dbt.ulen)) != 0)
-			goto err2;
+	/* Check to see if the record is in the cursor's buffer. */
+	if ((ret = __log_c_incursor(logc, &nlsn, &hdr, &rp)) != 0)
+		goto err;
+	if (rp != NULL)
 		goto cksum;
-	}
 
-	shortp = NULL;
+	/*
+	 * Look to see if we're moving backward in the log with the last record
+	 * coming from the disk -- it means the record can't be in the region's
+	 * buffer.  Else, check the region's buffer.
+	 *
+	 * If the record isn't in the region's buffer, we're going to have to
+	 * read the record from disk.  We want to make a point of not reading
+	 * past the end of the logical log (after recovery, there may be data
+	 * after the end of the logical log, not to mention the log file may
+	 * have been pre-allocated).  So, zero out last_lsn, and initialize it
+	 * inside __log_c_inregion -- if it's still zero when we check it in
+	 * __log_c_ondisk, that's OK, it just means the logical end of the log
+	 * isn't an issue for this request.
+	 */
+	ZERO_LSN(last_lsn);
+	if (!F_ISSET(logc, DB_LOG_DISK) ||
+	    log_compare(&nlsn, &logc->c_lsn) > 0) {
+		F_CLR(logc, DB_LOG_DISK);
 
-	/* Acquire a file descriptor. */
-	if (!F_ISSET(&dblp->c_fh, DB_FH_VALID)) {
-		if ((ret = __log_name(dblp, nlsn.file,
-		    &np, &dblp->c_fh, DB_OSO_RDONLY | DB_OSO_SEQ)) != 0) {
-			fail = np;
-			goto err1;
-		}
-		__os_freestr(np);
-		np = NULL;
+		if ((ret = __log_c_inregion(logc,
+		    &nlsn, &rlock, &last_lsn, &hdr, &rp)) != 0)
+			goto err;
+		if (rp != NULL)
+			goto cksum;
 	}
 
-	/* See if we've already read this */
-	if (nlsn.file == dblp->r_file && nlsn.offset > dblp->r_off
-	     && nlsn.offset + sizeof(HDR) < dblp->r_off + dblp->r_size)
-		goto got_header;
-
 	/*
-	 * Seek to the header offset and read the header.  Because the file
-	 * may be pre-allocated, we have to make sure that we're not reading
-	 * past the information in the start of the in-memory buffer.
+	 * We have to read from an on-disk file to retrieve the record.
+	 * If we ever can't retrieve the record at offset 0, we're done,
+	 * return EOF/DB_NOTFOUND.
+	 *
+	 * Discard the region lock if we're still holding it, the on-disk
+	 * reading routines don't need it.
 	 */
-
-	readp = &hdr;
-	offset = nlsn.offset;
-	if (nlsn.file == lp->lsn.file && offset + sizeof(HDR) > lp->w_off)
-		nr = lp->w_off - offset;
-	else if (dblp->readbufp == NULL)
-		nr = sizeof(HDR);
-	else  {
-		nr = lp->buffer_size;
-		readp = dblp->readbufp;
-		dblp->r_file = nlsn.file;
-		/* Going backwards.  Put the current in the middle. */
-		if (flags == DB_PREV || flags == DB_LAST) {
-			if (offset <= lp->buffer_size/2)
-				offset = 0;
-			else
-				offset = offset - lp->buffer_size/2;
-		}
-		if (nlsn.file == lp->lsn.file && offset + nr > lp->lsn.offset)
-			nr = lp->lsn.offset - offset;
-		dblp->r_off = offset;
+	if (rlock == L_ACQUIRED) {
+		rlock = L_NONE;
+		R_UNLOCK(dbenv, &dblp->reginfo);
+	}
+	if ((ret = __log_c_ondisk(
+	    logc, &nlsn, &last_lsn, flags, &hdr, &rp, &eof)) != 0)
+		goto err;
+	if (eof == 1) {
+		/*
+		 * Only DB_NEXT automatically moves to the next file, and
+		 * it only happens once.
+		 */
+		if (flags != DB_NEXT || nlsn.offset == 0)
+			return (DB_NOTFOUND);
+		goto next_file;
 	}
+	F_SET(logc, DB_LOG_DISK);
 
-	if ((ret = __os_seek(dblp->dbenv,
-	    &dblp->c_fh, 0, 0, offset, 0, DB_OS_SEEK_SET)) != 0) {
-		fail = "seek";
-		goto err1;
+cksum:	/*
+	 * Discard the region lock if we're still holding it.  (The path to
+	 * get here is that we acquired the lock because of the caller's
+	 * flag argument, but we found the record in the cursor's buffer.
+	 * Improbable, but it's easy to avoid.)
+	 */
+	if (rlock == L_ACQUIRED) {
+		rlock = L_NONE;
+		R_UNLOCK(dbenv, &dblp->reginfo);
 	}
-	if ((ret = __os_read(dblp->dbenv, &dblp->c_fh, readp, nr, &nr)) != 0) {
-		fail = "read";
-		goto err1;
+
+	/*
+	 * Checksum: there are two types of errors -- a configuration error
+	 * or a checksum mismatch.  The former is always bad.  The latter is
+	 * OK if we're searching for the end of the log, and very, very bad
+	 * if we're reading random log records.
+	 */
+	db_cipher = dbenv->crypto_handle;
+	if ((ret = __db_check_chksum(dbenv, db_cipher,
+	    hdr.chksum, rp + hdr.size, hdr.len - hdr.size, is_hmac)) != 0) {
+		if (F_ISSET(logc, DB_LOG_SILENT_ERR)) {
+			if (ret == 0 || ret == -1)
+				ret = EIO;
+		} else if (ret == -1) {
+			__db_err(dbenv,
+		    "DB_LOGC->get: log record checksum mismatch");
+			__db_err(dbenv,
+		    "DB_LOGC->get: catastrophic recovery may be required");
+			ret = __db_panic(dbenv, DB_RUNRECOVERY);
+		}
+		goto err;
 	}
-	if (nr < sizeof(HDR)) {
-		/* If read returns EOF, try the next file. */
-		if (nr == 0) {
-			if (flags != DB_NEXT || nlsn.file == lp->lsn.file)
-				goto corrupt;
+
+	/*
+	 * If we got a 0-length record, that means we're in the midst of
+	 * some bytes that got 0'd as the result of a vtruncate.  We're
+	 * going to have to retry.
+	 */
+	if (hdr.len == 0) {
+		switch (flags) {
+		case DB_FIRST:
+		case DB_NEXT:
+			/* Zero'd records always indicate the end of a file. */
 			goto next_file;
+
+		case DB_LAST:
+		case DB_PREV:
+			/*
+			 * We should never get here.  If we recover a log
+			 * file with 0's at the end, we'll treat the 0'd
+			 * headers as the end of log and ignore them.  If
+			 * we're reading backwards from another file, then
+			 * the first record in that new file should have its
+			 * prev field set correctly.
+			 */
+			 __db_err(dbenv,
+		"Encountered zero length records while traversing backwards");
+			 DB_ASSERT(0);
+		case DB_SET:
+		default:
+			/* Return the 0-length record. */
+			break;
 		}
+	}
 
-		if (dblp->readbufp != NULL)
-			memcpy((u_int8_t *) &hdr, readp, nr);
+	/* Copy the record into the user's DBT. */
+	if ((ret = __db_retcopy(dbenv, dbt, rp + hdr.size,
+	    (u_int32_t)(hdr.len - hdr.size),
+	    &logc->c_dbt.data, &logc->c_dbt.ulen)) != 0)
+		goto err;
 
+	if (CRYPTO_ON(dbenv)) {
+		if ((ret = db_cipher->decrypt(dbenv, db_cipher->data,
+		    hdr.iv, dbt->data, hdr.len - hdr.size)) != 0) {
+			ret = EAGAIN;
+			goto err;
+		}
 		/*
-		 * If read returns a short count the rest of the record has
-		 * to be in the in-memory buffer.
+		 * Return the original log record size to the user,
+		 * even though we've allocated more than that, possibly.
+		 * The log record is decrypted in the user dbt, not in
+		 * the buffer, so we must do this here after decryption,
+		 * not adjust the len passed to the __db_retcopy call.
 		 */
-		if (lp->b_off < sizeof(HDR) - nr)
-			goto corrupt;
+		dbt->size = hdr.orig_size;
+	}
 
-		/* Get the rest of the header from the in-memory buffer. */
-		memcpy((u_int8_t *)&hdr + nr, dblp->bufp, sizeof(HDR) - nr);
+	/* Update the cursor and the returned LSN. */
+	*alsn = nlsn;
+	logc->c_lsn = nlsn;
+	logc->c_len = hdr.len;
+	logc->c_prev = hdr.prev;
 
-		if (hdr.len == 0)
-			goto next_file;
+err:	if (rlock == L_ACQUIRED)
+		R_UNLOCK(dbenv, &dblp->reginfo);
 
-		shortp = dblp->bufp + (sizeof(HDR) - nr);
-	}
+	return (ret);
+}
 
-	else if (dblp->readbufp != NULL) {
-		dblp->r_size = nr;
-got_header:	memcpy((u_int8_t *)&hdr,
-		    dblp->readbufp + (nlsn.offset - dblp->r_off), sizeof(HDR));
-	}
+/*
+ * __log_c_incursor --
+ *	Set *pp to the requested record if the cursor's buffer holds all of it.
+ */
+static int
+__log_c_incursor(logc, lsn, hdr, pp)
+	DB_LOGC *logc;
+	DB_LSN *lsn;
+	HDR *hdr;
+	u_int8_t **pp;
+{
+	u_int8_t *p;
+
+	*pp = NULL;			/* Default: not in the buffer. */
 
 	/*
-	 * Check for buffers of 0's, that's what we usually see during recovery,
-	 * although it's certainly not something on which we can depend.  Check
-	 * for impossibly large records.  The malloc should fail later, but we
-	 * have customers that run mallocs that handle allocation failure as a
-	 * fatal error.
+	 * Test to see if the requested LSN could be part of the cursor's
+	 * buffer.
+	 *
+	 * The record must be part of the same file as the cursor's buffer.
+	 * The record must start at a byte offset equal to or greater than
+	 * the cursor buffer.
+	 * The record must not start at a byte offset after the cursor
+	 * buffer's end.
 	 */
-	if (hdr.len == 0)
-		goto next_file;
-	if (hdr.len <= sizeof(HDR) || hdr.len > lp->persist.lg_max)
-		goto corrupt;
-	len = hdr.len - sizeof(HDR);
-
-	/* If we've already moved to the in-memory buffer, fill from there. */
-	if (shortp != NULL) {
-		if (lp->b_off < ((u_int8_t *)shortp - dblp->bufp) + len)
-			goto corrupt;
-		if ((ret = __db_retcopy(NULL, dbt, shortp, len,
-		    &dblp->c_dbt.data, &dblp->c_dbt.ulen)) != 0)
-			goto err2;
-		goto cksum;
-	}
+	if (logc->bp_lsn.file != lsn->file)
+		return (0);
+	if (logc->bp_lsn.offset > lsn->offset)
+		return (0);
+	if (logc->bp_lsn.offset + logc->bp_rlen <= lsn->offset + hdr->size)
+		return (0);
 
-	if (dblp->readbufp != NULL) {
-		if (nlsn.offset + hdr.len < dblp->r_off + dblp->r_size) {
-			if ((ret = __db_retcopy(NULL, dbt, dblp->readbufp +
-			     (nlsn.offset - dblp->r_off) + sizeof(HDR),
-			     len, &dblp->c_dbt.data, &dblp->c_dbt.ulen)) != 0)
-				goto err2;
-			goto cksum;
-		} else if ((ret = __os_seek(dblp->dbenv, &dblp->c_fh, 0,
-		    0, nlsn.offset + sizeof(HDR), 0, DB_OS_SEEK_SET)) != 0) {
-			fail = "seek";
-			goto err1;
-		}
+	/*
+	 * Read the record's header and check if the record is entirely held
+	 * in the buffer.  If the record is not entirely held, get it again.
+	 * (The only advantage in having part of the record locally is that
+	 * we might avoid a system call because we already have the HDR in
+	 * memory.)
+	 *
+	 * If the header check fails for any reason, it must be because the
+	 * LSN is bogus.  Fail hard.
+	 */
+	p = logc->bp + (lsn->offset - logc->bp_lsn.offset);
+	memcpy(hdr, p, hdr->size);
+	if (__log_c_hdrchk(logc, hdr, NULL))
+		return (DB_NOTFOUND);
+	if (logc->bp_lsn.offset + logc->bp_rlen <= lsn->offset + hdr->len)
+		return (0);			/* Only partly buffered. */
+
+	*pp = p;				/* Success. */
+
+	return (0);
+}
+
+/*
+ * __log_c_inregion --
+ *	Fetch the record into the cursor's buffer if the region buffer has it.
+ */
+static int
+__log_c_inregion(logc, lsn, rlockp, last_lsn, hdr, pp)
+	DB_LOGC *logc;
+	DB_LSN *lsn, *last_lsn;
+	RLOCK *rlockp;
+	HDR *hdr;
+	u_int8_t **pp;
+{
+	DB_ENV *dbenv;
+	DB_LOG *dblp;
+	LOG *lp;
+	size_t len, nr;
+	u_int32_t b_disk, b_region;
+	int ret;
+	u_int8_t *p;
+
+	dbenv = logc->dbenv;
+	dblp = dbenv->lg_handle;
+	lp = ((DB_LOG *)logc->dbenv->lg_handle)->reginfo.primary;
+
+	ret = 0;
+	*pp = NULL;			/* Default: not in the buffer. */
+
+	/* If we haven't yet acquired the log region lock, do so. */
+	if (*rlockp == L_NONE) {
+		*rlockp = L_ACQUIRED;	/* Caller releases it if we don't. */
+		R_LOCK(dbenv, &dblp->reginfo);
 	}
 
 	/*
-	 * Allocate temporary memory to hold the record.
+	 * The routines to read from disk must avoid reading past the logical
+	 * end of the log, so pass that information back to it.
 	 *
-	 * XXX
-	 * We're calling malloc(3) with a region locked.  This isn't
-	 * a good idea.
+	 * Since they're reading directly from the disk, they must also avoid
+	 * reading past the offset we've written out.  If the log was
+	 * truncated, it's possible that there are zeroes or garbage on
+	 * disk after this offset, and the logical end of the log can
+	 * come later than this point if the log buffer isn't empty.
 	 */
-	if ((ret = __os_malloc(dbenv, len, NULL, &tbuf)) != 0)
-		goto err1;
+	*last_lsn = lp->lsn;
+	if (last_lsn->offset > lp->w_off)
+		last_lsn->offset = lp->w_off;
 
 	/*
-	 * Read the record into the buffer.  If read returns a short count,
-	 * there was an error or the rest of the record is in the in-memory
-	 * buffer.  Note, the information may be garbage if we're in recovery,
-	 * so don't read past the end of the buffer's memory.
-	 *
-	 * Because the file may be pre-allocated, we have to make sure that
-	 * we're not reading past the information in the start of the in-memory
+	 * Test to see if the requested LSN could be part of the region's
 	 * buffer.
+	 *
+	 * During recovery, we read the log files getting the information to
+	 * initialize the region.  In that case, the region's lsn field will
+	 * not yet have been filled in, use only the disk.
+	 *
+	 * The record must not start at a byte offset after the region buffer's
+	 * end, since that means the request is for a record after the end of
+	 * the log.  Do this test even if the region's buffer is empty -- after
+	 * recovery, the log files may continue past the declared end-of-log,
+	 * and the disk reading routine will incorrectly attempt to read the
+	 * remainder of the log.
+	 *
+	 * Otherwise, test to see if the region's buffer actually has what we
+	 * want:
+	 *
+	 * The buffer must have some useful content.
+	 * The record must be in the same file as the region's buffer and must
+	 * start at a byte offset equal to or greater than the region's buffer.
+	 */
+	if (IS_ZERO_LSN(lp->lsn))
+		return (0);
+	if (lsn->file > lp->lsn.file ||
+	    (lsn->file == lp->lsn.file && lsn->offset >= lp->lsn.offset))
+		return (DB_NOTFOUND);
+	if (lp->b_off == 0)
+		return (0);
+	if (lsn->file < lp->f_lsn.file || lsn->offset < lp->f_lsn.offset)
+		return (0);
+
+	/*
+	 * The current contents of the cursor's buffer will be useless for a
+	 * future call -- trash it rather than try and make it look correct.
+	 */
+	ZERO_LSN(logc->bp_lsn);
+
+	/*
+	 * If the requested LSN is greater than the region buffer's first
+	 * byte, we know the entire record is in the buffer.
+	 *
+	 * If the header check fails for any reason, it must be because the
+	 * LSN is bogus.  Fail hard.
 	 */
-	if (nlsn.file == lp->lsn.file &&
-	    nlsn.offset + sizeof(HDR) + len > lp->w_off)
-		nr = lp->w_off - (nlsn.offset + sizeof(HDR));
+	if (lsn->offset > lp->f_lsn.offset) {
+		p = dblp->bufp + (lsn->offset - lp->w_off);
+		memcpy(hdr, p, hdr->size);
+		if (__log_c_hdrchk(logc, hdr, NULL))
+			return (DB_NOTFOUND);
+		if (logc->bp_size <= hdr->len) {
+			len = ALIGN(hdr->len * 2, 128);
+			if ((ret =
+			    __os_realloc(logc->dbenv, len, &logc->bp)) != 0)
+				 return (ret);
+			logc->bp_size = (u_int32_t)len;
+		}
+		memcpy(logc->bp, p, hdr->len);
+		*pp = logc->bp;
+		return (0);			/* Whole record copied. */
+	}
+
+	/*
+	 * There's a partial record, that is, the requested record starts
+	 * in a log file and finishes in the region buffer.  We have to
+	 * find out how many bytes of the record are in the region buffer
+	 * so we can copy them out into the cursor buffer.  First, check
+	 * to see if the requested record is the only record in the region
+	 * buffer, in which case we should copy the entire region buffer.
+	 *
+	 * Else, walk back through the region's buffer to find the first LSN
+	 * after the record that crosses the buffer boundary -- we can detect
+	 * that LSN, because its "prev" field will reference the record we
+	 * want.  The bytes we need to copy from the region buffer are the
+	 * bytes up to the record we find.  The bytes we'll need to allocate
+	 * to hold the log record are the bytes between the two offsets.
+	 */
+	b_disk = lp->w_off - lsn->offset;
+	if (lp->b_off <= lp->len)
+		b_region = (u_int32_t)lp->b_off;
 	else
-		nr = len;
-	if ((ret = __os_read(dblp->dbenv, &dblp->c_fh, tbuf, nr, &nr)) != 0) {
-		fail = "read";
-		goto err1;
+		for (p = dblp->bufp + (lp->b_off - lp->len);;) {
+			memcpy(hdr, p, hdr->size);
+			if (hdr->prev == lsn->offset) {
+				b_region = (u_int32_t)(p - dblp->bufp);
+				break;
+			}
+			p = dblp->bufp + (hdr->prev - lp->w_off);
+		}
+
+	/*
+	 * If we don't have enough room for the record, we have to allocate
+	 * space.  We have to do it while holding the region lock, which is
+	 * truly annoying, but there's no way around it.  This call is why
+	 * we allocate cursor buffer space when allocating the cursor instead
+	 * of waiting.
+	 */
+	if (logc->bp_size <= b_region + b_disk) {
+		len = ALIGN((b_region + b_disk) * 2, 128);
+		if ((ret = __os_realloc(logc->dbenv, len, &logc->bp)) != 0)
+			return (ret);
+		logc->bp_size = (u_int32_t)len;
 	}
-	if (len - nr > lp->buffer_size)
-		goto corrupt;
-	if (nr != len) {
-		if (lp->b_off < len - nr)
-			goto corrupt;
-
-		/* Get the rest of the record from the in-memory buffer. */
-		memcpy((u_int8_t *)tbuf + nr, dblp->bufp, len - nr);
+
+	/* Copy the region's bytes to the end of the cursor's buffer. */
+	p = (logc->bp + logc->bp_size) - b_region;
+	memcpy(p, dblp->bufp, b_region);
+
+	/* Release the region lock. */
+	if (*rlockp == L_ACQUIRED) {
+		*rlockp = L_NONE;
+		R_UNLOCK(dbenv, &dblp->reginfo);
 	}
 
-	/* Copy the record into the user's DBT. */
-	if ((ret = __db_retcopy(NULL, dbt, tbuf, len,
-	    &dblp->c_dbt.data, &dblp->c_dbt.ulen)) != 0)
-		goto err2;
-	__os_free(tbuf, 0);
-	tbuf = NULL;
+	/*
+	 * Read the rest of the information from disk.  Neither short reads
+	 * nor EOF are acceptable, the bytes we want had better be there.
+	 */
+	if (b_disk != 0) {
+		p -= b_disk;
+		nr = b_disk;
+		if ((ret = __log_c_io(
+		    logc, lsn->file, lsn->offset, p, &nr, NULL)) != 0)
+			return (ret);
+		if (nr < b_disk)
+			return (__log_c_shortread(logc, 0));
+	}
 
-cksum:	/*
-	 * If the user specified a partial record read, the checksum can't
-	 * match.  It's not an obvious thing to do, but a user testing for
-	 * the length of a record might do it.
+	/* Copy the header information into the caller's structure. */
+	memcpy(hdr, p, hdr->size);
+
+	*pp = p;
+	return (0);
+}
+
+/*
+ * __log_c_ondisk --
+ *	Read a record off disk into the cursor's buffer; *eofp is set at EOF.
+ */
+static int
+__log_c_ondisk(logc, lsn, last_lsn, flags, hdr, pp, eofp)
+	DB_LOGC *logc;
+	DB_LSN *lsn, *last_lsn;
+	int flags, *eofp;
+	HDR *hdr;
+	u_int8_t **pp;
+{
+	DB_ENV *dbenv;
+	size_t len, nr;
+	u_int32_t offset;
+	int ret;
+
+	dbenv = logc->dbenv;
+	*eofp = 0;
+
+	nr = hdr->size;
+	if ((ret =
+	    __log_c_io(logc, lsn->file, lsn->offset, hdr, &nr, eofp)) != 0)
+		return (ret);
+	if (*eofp)
+		return (0);			/* EOF isn't an error. */
+
+	/* If we read 0 bytes, assume we've hit EOF. */
+	if (nr == 0) {
+		*eofp = 1;
+		return (0);
+	}
+
+	/* Check the HDR. */
+	if ((ret = __log_c_hdrchk(logc, hdr, eofp)) != 0)
+		return (ret);
+	if (*eofp)
+		return (0);
+
+	/* Otherwise, we should have gotten the bytes we wanted. */
+	if (nr < hdr->size)
+		return (__log_c_shortread(logc, 0));
+
+	/*
+	 * Regardless of how we return, the previous contents of the cursor's
+	 * buffer are useless -- trash it.
 	 */
-	if (!F_ISSET(dbt, DB_DBT_PARTIAL) &&
-	    hdr.cksum != __ham_func4(NULL, dbt->data, dbt->size)) {
-		if (!silent)
-			__db_err(dbenv, "log_get: checksum mismatch");
-		goto corrupt;
+	ZERO_LSN(logc->bp_lsn);
+
+	/*
+	 * Otherwise, we now (finally!) know how big the record is.  (Maybe
+	 * we should have just stuck the length of the record into the LSN!?)
+	 * Make sure we have enough space.
+	 */
+	if (logc->bp_size <= hdr->len) {
+		len = ALIGN(hdr->len * 2, 128);
+		if ((ret = __os_realloc(dbenv, len, &logc->bp)) != 0)
+			return (ret);
+		logc->bp_size = (u_int32_t)len;
 	}
 
-	/* Update the cursor and the return lsn. */
-	dblp->c_off = hdr.prev;
-	dblp->c_len = hdr.len;
-	dblp->c_lsn = nlsn;
-	*alsn = nlsn;
+	/*
+	 * If we're moving forward in the log file, read this record in at the
+	 * beginning of the buffer.  Otherwise, read this record in at the end
+	 * of the buffer, making sure we don't try and read before the start
+	 * of the file.  (We prefer positioning at the end because transaction
+	 * aborts use DB_SET to move backward through the log and we might get
+	 * lucky.)
+	 *
+	 * Read a buffer's worth, without reading past the logical EOF.  The
+	 * last_lsn may be a zero LSN, but that's OK, the test works anyway.
+	 */
+	if (flags == DB_FIRST || flags == DB_NEXT)
+		offset = lsn->offset;
+	else if (lsn->offset + hdr->len < logc->bp_size)
+		offset = 0;
+	else
+		offset = (lsn->offset + hdr->len) - logc->bp_size;
+
+	nr = logc->bp_size;
+	if (lsn->file == last_lsn->file && offset + nr >= last_lsn->offset)
+		nr = last_lsn->offset - offset;
+
+	if ((ret =
+	    __log_c_io(logc, lsn->file, offset, logc->bp, &nr, eofp)) != 0)
+		return (ret);
+
+	/*
+	 * We should have at least gotten the bytes up-to-and-including the
+	 * record we're reading.
+	 */
+	if (nr < (lsn->offset + hdr->len) - offset)
+		return (__log_c_shortread(logc, 1));
+
+	/* Set up the return information. */
+	logc->bp_rlen = (u_int32_t)nr;
+	logc->bp_lsn.file = lsn->file;
+	logc->bp_lsn.offset = offset;
 
+	*pp = logc->bp + (lsn->offset - offset);
+
+	return (0);
+}
+
+/*
+ * __log_c_hdrchk --
+ *
+ * Check for corrupted HDRs before we use them to allocate memory or find
+ * records.
+ *
+ * If the log files were pre-allocated, a zero-filled HDR structure is the
+ * logical file end.  However, we can see buffers filled with 0's during
+ * recovery, too (because multiple log buffers were written asynchronously,
+ * and one made it to disk before a different one that logically precedes
+ * it in the log file).
+ *
+ * XXX
+ * I think there's a potential pre-allocation recovery flaw here -- if we
+ * fail to write a buffer at the end of a log file (by scheduling its
+ * write asynchronously, and it never making it to disk), then succeed in
+ * writing a log file block to a subsequent log file, I don't think we will
+ * detect that the buffer of 0's should have marked the end of the log files
+ * during recovery.  I think we may need to always write some garbage after
+ * each block write if we pre-allocate log files.  (At the moment, we do not
+ * pre-allocate, so this isn't currently an issue.)
+ *
+ * Check for impossibly large records.  The malloc should fail later, but we
+ * have customers that run mallocs that treat all allocation failures as fatal
+ * errors.
+ *
+ * Note that none of this is necessarily something awful happening.  We let
+ * the application hand us any LSN they want, and it could be a pointer into
+ * the middle of a log record, there's no way to tell.
+ */
+static int
+__log_c_hdrchk(logc, hdr, eofp)
+	DB_LOGC *logc;
+	HDR *hdr;
+	int *eofp;
+{
+	DB_ENV *dbenv;
+	int ret;
+
+	dbenv = logc->dbenv;
+
+	/* A zero-filled HDR is the logical EOF; test it before the size. */
+	if (eofp != NULL) {
+		if (hdr->prev == 0 && hdr->chksum[0] == 0 && hdr->len == 0) {
+			*eofp = 1;
+			return (0);
+		}
+		*eofp = 0;
+	}
+
+	/* Sanity check the log record's size (a len of 0 fails here). */
+	if (hdr->len <= hdr->size)
+		goto err;
+	/*
+	 * If the cursor's max-record value isn't yet set, it means we aren't
+	 * reading these records from a log file and no check is necessary.
+	 */
+	if (logc->bp_maxrec != 0 && hdr->len > logc->bp_maxrec) {
+		/*
+		 * If we fail the check, we may be reading the last file while
+		 * it grows and our size information is stale.  Get it again.
+		 */
+		if ((ret = __log_c_set_maxrec(logc, NULL)) != 0) {
+			__db_err(dbenv, "DB_LOGC->get: %s", db_strerror(ret));
+			return (ret);
+		}
+		if (logc->bp_maxrec != 0 && hdr->len > logc->bp_maxrec)
+			goto err;
+	}
 	return (0);
 
-corrupt:/*
-	 * This is the catchall -- for some reason we didn't find enough
-	 * information or it wasn't reasonable information, and it wasn't
-	 * because a system call failed.
+err:	if (!F_ISSET(logc, DB_LOG_SILENT_ERR))
+		__db_err(dbenv, "DB_LOGC->get: invalid log record header");
+	return (EIO);
+}
+
+/*
+ * __log_c_io --
+ *	Read records from a log file.
+ */
+static int
+__log_c_io(logc, fnum, offset, p, nrp, eofp)
+	DB_LOGC *logc;
+	u_int32_t fnum, offset;
+	void *p;
+	size_t *nrp;
+	int *eofp;
+{
+	DB_ENV *dbenv;
+	DB_LOG *dblp;
+	int ret;
+	char *np;
+
+	dbenv = logc->dbenv;
+	dblp = dbenv->lg_handle;
+
+	/*
+	 * If we've switched files, discard the current file handle and acquire
+	 * a new one.
 	 */
-	ret = EIO;
-	fail = "read";
+	if (F_ISSET(logc->c_fh, DB_FH_VALID) && logc->bp_lsn.file != fnum)
+		if ((ret = __os_closehandle(dbenv, logc->c_fh)) != 0)
+			return (ret);
+	if (!F_ISSET(logc->c_fh, DB_FH_VALID)) {
+		if ((ret = __log_name(dblp, fnum,
+		    &np, logc->c_fh, DB_OSO_RDONLY | DB_OSO_SEQ)) != 0) {
+			/*
+			 * If we're allowed to return EOF, assume that's the
+			 * problem, set the EOF status flag and return 0.
+			 */
+			if (eofp != NULL) {
+				*eofp = 1;
+				ret = 0;
+			} else if (!F_ISSET(logc, DB_LOG_SILENT_ERR))
+				__db_err(dbenv, "DB_LOGC->get: %s: %s",
+				    np, db_strerror(ret));
+			__os_free(dbenv, np);
+			return (ret);
+		}
 
-err1:	if (!silent) {
-		if (fail == NULL)
-			__db_err(dbenv, "log_get: %s", db_strerror(ret));
-		else
+		if ((ret = __log_c_set_maxrec(logc, np)) != 0) {
 			__db_err(dbenv,
-			    "log_get: %s: %s", fail, db_strerror(ret));
+			    "DB_LOGC->get: %s: %s", np, db_strerror(ret));
+			__os_free(dbenv, np);
+			return (ret);
+		}
+		__os_free(dbenv, np);
 	}
 
-err2:	if (np != NULL)
-		__os_freestr(np);
-	if (tbuf != NULL)
-		__os_free(tbuf, 0);
-	return (ret);
+	/* Seek to the record's offset. */
+	if ((ret = __os_seek(dbenv,
+	    logc->c_fh, 0, 0, offset, 0, DB_OS_SEEK_SET)) != 0) {
+		if (!F_ISSET(logc, DB_LOG_SILENT_ERR))
+			__db_err(dbenv,
+			    "DB_LOGC->get: seek: %s", db_strerror(ret));
+		return (ret);
+	}
+
+	/* Read the data. */
+	if ((ret = __os_read(dbenv, logc->c_fh, p, *nrp, nrp)) != 0) {
+		if (!F_ISSET(logc, DB_LOG_SILENT_ERR))
+			__db_err(dbenv,
+			    "DB_LOGC->get: read: %s", db_strerror(ret));
+		return (ret);
+	}
+
+	return (0);
+}
+
+/*
+ * __log_c_shortread --
+ *	Short read: return EIO; DB_LOG_SILENT_ERR mutes the message if silent.
+ */
+static int
+__log_c_shortread(logc, silent)
+	DB_LOGC *logc;
+	int silent;
+{
+	if (!(silent && F_ISSET(logc, DB_LOG_SILENT_ERR)))
+		__db_err(logc->dbenv, "DB_LOGC->get: short read");
+	return (EIO);
+}
+
+/*
+ * __log_c_set_maxrec --
+ *	Set the cursor's upper bound on the size of a record in its log file.
+ */
+static int
+__log_c_set_maxrec(logc, np)
+	DB_LOGC *logc;
+	char *np;
+{
+	DB_ENV *dbenv;
+	DB_LOG *dblp;
+	LOG *region;
+	u_int32_t file_mb, file_b;
+	int ret;
+
+	dbenv = logc->dbenv;
+	dblp = dbenv->lg_handle;
+
+	/*
+	 * A corrupted record can claim an absurd length, and applications
+	 * whose error-checking malloc treats a failed allocation as fatal
+	 * must never be asked for that much memory, so we need a bound on
+	 * the record size before allocating space for it.  Reading the
+	 * persistent data at the start of the file would give one, but
+	 * that's hard -- it may need decrypting, checksumming and so on.
+	 * Stat the file instead.
+	 */
+	ret = __os_ioinfo(dbenv, np, logc->c_fh, &file_mb, &file_b, NULL);
+	if (ret != 0)
+		return (ret);
+
+	/* Start from the size of the file as reported above. */
+	logc->bp_maxrec = file_mb * MEGABYTE + file_b;
+
+	/*
+	 * The file may be the one currently being written, in which case
+	 * the size just obtained can go stale: a cursor opened when the
+	 * file held a few hundred bytes may later move forward over
+	 * records appended since.  Add the log buffer size to the file
+	 * size -- the sum should bound any log record currently in the
+	 * file.
+	 *
+	 * The log buffer size is set when the environment is opened and
+	 * never changes, so no lock is needed to read it.
+	 */
+	region = dblp->reginfo.primary;
+	logc->bp_maxrec += region->buffer_size;
+
+	return (0);
 }
diff --git a/bdb/log/log_method.c b/bdb/log/log_method.c
index 883f485d891..42adaf11c6c 100644
--- a/bdb/log/log_method.c
+++ b/bdb/log/log_method.c
@@ -1,38 +1,39 @@
 /*-
  * See the file LICENSE for redistribution information.
  *
- * Copyright (c) 1999, 2000
+ * Copyright (c) 1999-2002
  *	Sleepycat Software.  All rights reserved.
  */
 #include "db_config.h"
 
 #ifndef lint
-static const char revid[] = "$Id: log_method.c,v 11.14 2000/11/30 00:58:40 ubell Exp $";
+static const char revid[] = "$Id: log_method.c,v 11.32 2002/05/30 22:16:47 bostic Exp $";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
 #include <sys/types.h>
 
+#ifdef HAVE_RPC
+#include <rpc/rpc.h>
+#endif
+
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 #endif
 
-#ifdef  HAVE_RPC
-#include "db_server.h"
-#endif
-
 #include "db_int.h"
-#include "log.h"
+#include "dbinc/log.h"
 
 #ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
+#include "dbinc_auto/db_server.h"
+#include "dbinc_auto/rpc_client_ext.h"
 #endif
 
-static int __log_set_lg_max __P((DB_ENV *, u_int32_t));
 static int __log_set_lg_bsize __P((DB_ENV *, u_int32_t));
 static int __log_set_lg_dir __P((DB_ENV *, const char *));
+static int __log_set_lg_max __P((DB_ENV *, u_int32_t));
+static int __log_set_lg_regionmax __P((DB_ENV *, u_int32_t));
 
 /*
  * __log_dbenv_create --
@@ -44,13 +45,16 @@ void
 __log_dbenv_create(dbenv)
 	DB_ENV *dbenv;
 {
-	dbenv->lg_bsize = LG_BSIZE_DEFAULT;
-	dbenv->set_lg_bsize = __log_set_lg_bsize;
+	/*
+	 * !!!
+	 * Our caller has not yet had the opportunity to reset the panic
+	 * state or turn off mutex locking, and so we can neither check
+	 * the panic state or acquire a mutex in the DB_ENV create path.
+	 */
 
-	dbenv->lg_max = LG_MAX_DEFAULT;
-	dbenv->set_lg_max = __log_set_lg_max;
+	dbenv->lg_bsize = LG_BSIZE_DEFAULT;
+	dbenv->lg_regionmax = LG_BASE_REGION_SIZE;
 
-	dbenv->set_lg_dir = __log_set_lg_dir;
 #ifdef	HAVE_RPC
 	/*
 	 * If we have a client, overwrite what we just setup to
@@ -58,10 +62,29 @@ __log_dbenv_create(dbenv)
 	 */
 	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) {
 		dbenv->set_lg_bsize = __dbcl_set_lg_bsize;
-		dbenv->set_lg_max = __dbcl_set_lg_max;
 		dbenv->set_lg_dir = __dbcl_set_lg_dir;
-	}
+		dbenv->set_lg_max = __dbcl_set_lg_max;
+		dbenv->set_lg_regionmax = __dbcl_set_lg_regionmax;
+		dbenv->log_archive = __dbcl_log_archive;
+		dbenv->log_cursor = __dbcl_log_cursor;
+		dbenv->log_file = __dbcl_log_file;
+		dbenv->log_flush = __dbcl_log_flush;
+		dbenv->log_put = __dbcl_log_put;
+		dbenv->log_stat = __dbcl_log_stat;
+	} else
 #endif
+	{
+		dbenv->set_lg_bsize = __log_set_lg_bsize;
+		dbenv->set_lg_dir = __log_set_lg_dir;
+		dbenv->set_lg_max = __log_set_lg_max;
+		dbenv->set_lg_regionmax = __log_set_lg_regionmax;
+		dbenv->log_archive = __log_archive;
+		dbenv->log_cursor = __log_cursor;
+		dbenv->log_file = __log_file;
+		dbenv->log_flush = __log_flush;
+		dbenv->log_put = __log_put;
+		dbenv->log_stat = __log_stat;
+	}
 }
 
 /*
@@ -73,10 +96,16 @@ __log_set_lg_bsize(dbenv, lg_bsize)
 	DB_ENV *dbenv;
 	u_int32_t lg_bsize;
 {
+	u_int32_t lg_max;
+
 	ENV_ILLEGAL_AFTER_OPEN(dbenv, "set_lg_bsize");
 
+	if (lg_bsize == 0)
+		lg_bsize = LG_BSIZE_DEFAULT;
+
 					/* Let's not be silly. */
-	if (lg_bsize > dbenv->lg_max / 4) {
+	lg_max = dbenv->lg_size == 0 ? LG_MAX_DEFAULT : dbenv->lg_size;
+	if (lg_bsize > lg_max / 4) {
 		__db_err(dbenv, "log buffer size must be <= log file size / 4");
 		return (EINVAL);
 	}
@@ -94,15 +123,53 @@ __log_set_lg_max(dbenv, lg_max)
 	DB_ENV *dbenv;
 	u_int32_t lg_max;
 {
-	ENV_ILLEGAL_AFTER_OPEN(dbenv, "set_lg_max");
+	LOG *region;
+
+	if (lg_max == 0)
+		lg_max = LG_MAX_DEFAULT;
+
+	if (F_ISSET(dbenv, DB_ENV_OPEN_CALLED)) {
+		if (!LOGGING_ON(dbenv))
+			return (__db_env_config(
+			    dbenv, "set_lg_max", DB_INIT_LOG));
+		region = ((DB_LOG *)dbenv->lg_handle)->reginfo.primary;
+
+					/* Let's not be silly. */
+		if (lg_max < region->buffer_size * 4)
+			goto err;
+		region->log_nsize = lg_max;
+	} else {
+					/* Let's not be silly. */
+		if (lg_max < dbenv->lg_bsize * 4)
+			goto err;
+		dbenv->lg_size = lg_max;
+	}
+
+	return (0);
+
+err:	__db_err(dbenv, "log file size must be >= log buffer size * 4");
+	return (EINVAL);
+}
+
+/*
+ * __log_set_lg_regionmax --
+ *	Set the log region size; non-zero values below the base size fail.
+ */
+static int
+__log_set_lg_regionmax(dbenv, lg_regionmax)
+	DB_ENV *dbenv;
+	u_int32_t lg_regionmax;
+{
+	ENV_ILLEGAL_AFTER_OPEN(dbenv, "set_lg_regionmax");
 
 					/* Let's not be silly. */
-	if (lg_max < dbenv->lg_bsize * 4) {
-		__db_err(dbenv, "log file size must be >= log buffer size * 4");
+	if (lg_regionmax != 0 && lg_regionmax < LG_BASE_REGION_SIZE) {
+		__db_err(dbenv,
+		    "log region size must be >= %d", LG_BASE_REGION_SIZE);
 		return (EINVAL);
 	}
 
-	dbenv->lg_max = lg_max;
+	dbenv->lg_regionmax = lg_regionmax;
 	return (0);
 }
 
@@ -116,6 +183,6 @@ __log_set_lg_dir(dbenv, dir)
 	const char *dir;
 {
 	if (dbenv->db_log_dir != NULL)
-		__os_freestr(dbenv->db_log_dir);
+		__os_free(dbenv, dbenv->db_log_dir);
 	return (__os_strdup(dbenv, dir, &dbenv->db_log_dir));
 }
diff --git a/bdb/log/log_put.c b/bdb/log/log_put.c
index c61f53e6c3d..bf6de2b0f7b 100644
--- a/bdb/log/log_put.c
+++ b/bdb/log/log_put.c
@@ -1,13 +1,13 @@
 /*-
  * See the file LICENSE for redistribution information.
  *
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
  *	Sleepycat Software.  All rights reserved.
  */
 #include "db_config.h"
 
 #ifndef lint
-static const char revid[] = "$Id: log_put.c,v 11.26 2000/11/30 00:58:40 ubell Exp $";
+static const char revid[] = "$Id: log_put.c,v 11.112 2002/09/10 02:39:26 bostic Exp $";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -29,109 +29,424 @@ static const char revid[] = "$Id: log_put.c,v 11.26 2000/11/30 00:58:40 ubell Ex
 #include <unistd.h>
 #endif
 
-#ifdef  HAVE_RPC
-#include "db_server.h"
-#endif
-
 #include "db_int.h"
-#include "db_page.h"
-#include "log.h"
-#include "hash.h"
-#include "clib_ext.h"
-
-#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
-#endif
+#include "dbinc/crypto.h"
+#include "dbinc/hmac.h"
+#include "dbinc/log.h"
+#include "dbinc/rep.h"
+#include "dbinc/txn.h"
 
+static int __log_encrypt_record __P((DB_ENV *, DBT *, HDR *, u_int32_t));
 static int __log_fill __P((DB_LOG *, DB_LSN *, void *, u_int32_t));
-static int __log_flush __P((DB_LOG *, const DB_LSN *));
+static int __log_flush_commit __P((DB_ENV *, const DB_LSN *, u_int32_t));
+static int __log_flush_int __P((DB_LOG *, const DB_LSN *, int));
 static int __log_newfh __P((DB_LOG *));
-static int __log_putr __P((DB_LOG *, DB_LSN *, const DBT *, u_int32_t));
-static int __log_open_files __P((DB_ENV *));
+static int __log_put_next __P((DB_ENV *,
+    DB_LSN *, const DBT *, HDR *, DB_LSN *));
+static int __log_putr __P((DB_LOG *,
+    DB_LSN *, const DBT *, u_int32_t, HDR *));
 static int __log_write __P((DB_LOG *, void *, u_int32_t));
 
 /*
- * log_put --
- *	Write a log record.
+ * __log_put --
+ *	Write a log record.  This is the public interface, DB_ENV->log_put.
+ *
+ * PUBLIC: int __log_put __P((DB_ENV *, DB_LSN *, const DBT *, u_int32_t));
  */
 int
-log_put(dbenv, lsn, dbt, flags)
+__log_put(dbenv, lsnp, udbt, flags)
 	DB_ENV *dbenv;
-	DB_LSN *lsn;
-	const DBT *dbt;
+	DB_LSN *lsnp;
+	const DBT *udbt;
 	u_int32_t flags;
 {
+	DB_CIPHER *db_cipher;
+	DBT *dbt, t;
 	DB_LOG *dblp;
-	int ret;
-
-#ifdef HAVE_RPC
-	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
-		return (__dbcl_log_put(dbenv, lsn, dbt, flags));
-#endif
+	DB_LSN lsn, old_lsn;
+	HDR hdr;
+	LOG *lp;
+	u_int32_t do_flush, op, writeonly;
+	int lock_held, need_free, ret;
+	u_int8_t *key;
 
 	PANIC_CHECK(dbenv);
-	ENV_REQUIRES_CONFIG(dbenv, dbenv->lg_handle, DB_INIT_LOG);
+	ENV_REQUIRES_CONFIG(dbenv,
+	    dbenv->lg_handle, "DB_ENV->log_put", DB_INIT_LOG);
 
 	/* Validate arguments. */
-	if (flags != 0 && flags != DB_CHECKPOINT &&
-	    flags != DB_CURLSN && flags != DB_FLUSH)
-		return (__db_ferr(dbenv, "log_put", 0));
+	op = DB_OPFLAGS_MASK & flags;
+	if (op != 0 && op != DB_COMMIT)
+		return (__db_ferr(dbenv, "DB_ENV->log_put", 0));
+
+	/* Check for allowed bit-flags. */
+	if (LF_ISSET(~(DB_OPFLAGS_MASK |
+	    DB_FLUSH | DB_NOCOPY | DB_PERMANENT | DB_WRNOSYNC)))
+		return (__db_ferr(dbenv, "DB_ENV->log_put", 0));
+
+	/* DB_WRNOSYNC and DB_FLUSH are mutually exclusive. */
+	if (LF_ISSET(DB_WRNOSYNC) && LF_ISSET(DB_FLUSH))
+		return (__db_ferr(dbenv, "DB_ENV->log_put", 1));
+
+	/* Replication clients should never write log records. */
+	if (F_ISSET(dbenv, DB_ENV_REP_CLIENT) ||
+	    F_ISSET(dbenv, DB_ENV_REP_LOGSONLY)) {
+		__db_err(dbenv,
+		    "DB_ENV->log_put is illegal on replication clients");
+		return (EINVAL);
+	}
 
 	dblp = dbenv->lg_handle;
+	lp = dblp->reginfo.primary;
+	db_cipher = dbenv->crypto_handle;
+	dbt = &t;
+	t = *udbt;
+	lock_held = need_free = 0;
+	do_flush = LF_ISSET(DB_FLUSH);
+	writeonly = LF_ISSET(DB_WRNOSYNC);
+
+	/*
+	 * If we are coming from the logging code, we use an internal
+	 * flag, DB_NOCOPY, because we know we can overwrite/encrypt
+	 * the log record in place.  Otherwise, if a user called log_put
+	 * then we must copy it to new memory so that we know we can
+	 * write it.
+	 *
+	 * We also must copy it to new memory if we are a replication
+	 * master so that we retain an unencrypted copy of the log
+	 * record to send to clients.
+	 */
+	if (!LF_ISSET(DB_NOCOPY) || F_ISSET(dbenv, DB_ENV_REP_MASTER)) {
+		if (CRYPTO_ON(dbenv))
+			t.size += db_cipher->adj_size(udbt->size);
+		if ((ret = __os_calloc(dbenv, 1, t.size, &t.data)) != 0)
+			goto err;
+		need_free = 1;
+		memcpy(t.data, udbt->data, udbt->size);
+	}
+	if ((ret = __log_encrypt_record(dbenv, dbt, &hdr, udbt->size)) != 0)
+		goto err;
+	if (CRYPTO_ON(dbenv))
+		key = db_cipher->mac_key;
+	else
+		key = NULL;
+	/* Otherwise, we actually have a record to put.  Put it. */
+
+	/* Before we grab the region lock, calculate the record's checksum. */
+	__db_chksum(dbt->data, dbt->size, key, hdr.chksum);
+
 	R_LOCK(dbenv, &dblp->reginfo);
-	ret = __log_put(dbenv, lsn, dbt, flags);
-	R_UNLOCK(dbenv, &dblp->reginfo);
+	lock_held = 1;
+
+	ZERO_LSN(old_lsn);
+	if ((ret = __log_put_next(dbenv, &lsn, dbt, &hdr, &old_lsn)) != 0)
+		goto err;
+
+	if (F_ISSET(dbenv, DB_ENV_REP_MASTER)) {
+		/*
+		 * Replication masters need to drop the lock to send
+		 * messages, but we want to drop and reacquire it a minimal
+		 * number of times.
+		 */
+		R_UNLOCK(dbenv, &dblp->reginfo);
+		lock_held = 0;
+
+		/*
+		 * If we changed files and we're in a replicated
+		 * environment, we need to inform our clients now that
+		 * we've dropped the region lock.
+		 *
+		 * Note that a failed NEWFILE send is a dropped message
+		 * that our client can handle, so we can ignore it.  It's
+		 * possible that the record we already put is a commit, so
+		 * we don't just want to return failure.
+		 */
+		if (!IS_ZERO_LSN(old_lsn))
+			(void)__rep_send_message(dbenv,
+			    DB_EID_BROADCAST, REP_NEWFILE, &old_lsn, NULL, 0);
+
+		/*
+		 * Then send the log record itself on to our clients.
+		 *
+		 * If the send fails and we're a commit or checkpoint,
+		 * there's nothing we can do;  the record's in the log.
+		 * Flush it, even if we're running with TXN_NOSYNC, on the
+		 * grounds that it should be in durable form somewhere.
+		 */
+		/*
+		 * !!!
+		 * In the crypto case, we MUST send the udbt, not the
+		 * now-encrypted dbt.  Clients have no way to decrypt
+		 * without the header.
+		 */
+		if ((__rep_send_message(dbenv,
+		    DB_EID_BROADCAST, REP_LOG, &lsn, udbt, flags) != 0) &&
+		    LF_ISSET(DB_PERMANENT))
+			do_flush |= DB_FLUSH;
+	}
+
+	/*
+	 * If needed, do a flush.  Note that failures at this point
+	 * are only permissible if we know we haven't written a commit
+	 * record;  __log_flush_commit is responsible for enforcing this.
+	 *
+	 * If a flush is not needed, see if WRITE_NOSYNC was set and we
+	 * need to write out the log buffer.
+	 */
+	if (do_flush || writeonly) {
+		if (!lock_held) {
+			R_LOCK(dbenv, &dblp->reginfo);
+			lock_held = 1;
+		}
+		if (do_flush)
+			ret = __log_flush_commit(dbenv, &lsn, flags);
+		else if (lp->b_off != 0)
+			/*
+			 * writeonly: if there's anything in the current
+			 * log buffer, we need to write it out.
+			 */
+			if ((ret = __log_write(dblp,
+			    dblp->bufp, (u_int32_t)lp->b_off)) == 0)
+				lp->b_off = 0;
+	}
+
+err:	if (lock_held)
+		R_UNLOCK(dbenv, &dblp->reginfo);
+	if (need_free)
+		__os_free(dbenv, dbt->data);
+
+	if (ret == 0)
+		*lsnp = lsn;
+
 	return (ret);
 }
 
 /*
- * __log_put --
- *	Write a log record; internal version.
+ * __log_txn_lsn --
  *
- * PUBLIC: int __log_put __P((DB_ENV *, DB_LSN *, const DBT *, u_int32_t));
+ * PUBLIC: void __log_txn_lsn
+ * PUBLIC:     __P((DB_ENV *, DB_LSN *, u_int32_t *, u_int32_t *));
  */
-int
-__log_put(dbenv, lsn, dbt, flags)
+void
+__log_txn_lsn(dbenv, lsnp, mbytesp, bytesp)
+	DB_ENV *dbenv;
+	DB_LSN *lsnp;
+	u_int32_t *mbytesp, *bytesp;
+{
+	DB_LOG *dblp;
+	LOG *reg;
+
+	dblp = dbenv->lg_handle;
+	reg = dblp->reginfo.primary;
+
+	R_LOCK(dbenv, &dblp->reginfo);
+
+	/*
+	 * Return the LSN of the last entry in the log.  It is used in two
+	 * places: 1) DB_ENV->txn_checkpoint uses it as a first value when
+	 * computing an LSN such that all transactions begun before it are
+	 * complete, and 2) DB_ENV->txn_begin uses it as the begin_lsn.
+	 *
+	 * Typically the last written LSN is easy to get: take the current
+	 * log pointer and back up by the number of bytes in the last log
+	 * record.  That doesn't work if the last thing we did was write
+	 * the log header of a new log file, though, so in that case we
+	 * return the LSN of the first log record that will be written in
+	 * the new file (the current log pointer, unadjusted).
+	 */
+	*lsnp = reg->lsn;
+	if (lsnp->offset > reg->len)
+		lsnp->offset -= reg->len;
+
+	/*
+	 * While we hold the log region lock, also return the bytes put
+	 * into the log since the last checkpoint, for transaction checkpoint.
+	 *
+	 * Include the current buffer offset so that bytes still sitting
+	 * in the log buffer, not yet written, are counted too.
+	 */
+	if (mbytesp != NULL) {
+		*mbytesp = reg->stat.st_wc_mbytes;
+		*bytesp = (u_int32_t)(reg->stat.st_wc_bytes + reg->b_off);
+	}
+
+	R_UNLOCK(dbenv, &dblp->reginfo);
+}
+
+/*
+ * __log_put_next --
+ *	Put the given record as the next in the log, wherever that may
+ * turn out to be.
+ */
+static int
+__log_put_next(dbenv, lsn, dbt, hdr, old_lsnp)
 	DB_ENV *dbenv;
 	DB_LSN *lsn;
 	const DBT *dbt;
-	u_int32_t flags;
+	HDR *hdr;
+	DB_LSN *old_lsnp;
 {
-	DBT t;
 	DB_LOG *dblp;
+	DB_LSN old_lsn;
 	LOG *lp;
-	u_int32_t lastoff;
-	int ret;
+	int newfile, ret;
 
 	dblp = dbenv->lg_handle;
 	lp = dblp->reginfo.primary;
 
 	/*
-	 * If the application just wants to know where we are, fill in
-	 * the information.  Currently used by the transaction manager
-	 * to avoid writing TXN_begin records.
+	 * Save a copy of lp->lsn before we might decide to switch log
+	 * files and change it.  If we do switch log files, and we're
+	 * doing replication, we'll need to tell our clients about the
+	 * switch, and they need to receive a NEWFILE message
+	 * with this "would-be" LSN in order to know they're not
+	 * missing any log records.
 	 */
-	if (flags == DB_CURLSN) {
-		lsn->file = lp->lsn.file;
-		lsn->offset = lp->lsn.offset;
-		return (0);
-	}
+	old_lsn = lp->lsn;
+	newfile = 0;
 
-	/* If this information won't fit in the file, swap files. */
-	if (lp->lsn.offset + sizeof(HDR) + dbt->size > lp->persist.lg_max) {
-		if (sizeof(HDR) +
-		    sizeof(LOGP) + dbt->size > lp->persist.lg_max) {
+	/*
+	 * If this information won't fit in the file, or if we're a
+	 * replication client environment and have been told to do so,
+	 * swap files.
+	 */
+	if (lp->lsn.offset == 0 ||
+	    lp->lsn.offset + hdr->size + dbt->size > lp->log_size) {
+		if (hdr->size + sizeof(LOGP) + dbt->size > lp->log_size) {
 			__db_err(dbenv,
-			    "log_put: record larger than maximum file size");
+		    "DB_ENV->log_put: record larger than maximum file size");
 			return (EINVAL);
 		}
 
-		/* Flush the log. */
-		if ((ret = __log_flush(dblp, NULL)) != 0)
+		if ((ret = __log_newfile(dblp, NULL)) != 0)
 			return (ret);
 
 		/*
+		 * Flag that we switched files, in case we're a master
+		 * and need to send this information to our clients.
+		 * We postpone doing the actual send until we can
+		 * safely release the log region lock and are doing so
+		 * anyway.
+		 */
+		newfile = 1;
+		
+		if (dbenv->db_noticecall != NULL)
+			dbenv->db_noticecall(dbenv, DB_NOTICE_LOGFILE_CHANGED);
+	}
+
+	/*
+	 * The offset into the log file at this point is the LSN where
+	 * we're about to put this record, and is the LSN the caller wants.
+	 */
+	*lsn = lp->lsn;
+
+	/* If we switched log files, let our caller know where. */
+	if (newfile)
+		*old_lsnp = old_lsn;
+
+	/* Actually put the record. */
+	return (__log_putr(dblp, lsn, dbt, lp->lsn.offset - lp->len, hdr));
+}
+
+/*
+ * __log_flush_commit --
+ *	Flush the log through a record that log_put was asked to DB_FLUSH.
+ */
+static int
+__log_flush_commit(dbenv, lsnp, flags)
+	DB_ENV *dbenv;
+	const DB_LSN *lsnp;
+	u_int32_t flags;
+{
+	DB_LOG *dblp;
+	DB_LSN target;
+	LOG *region;
+	int is_commit, ret;
+
+	dblp = dbenv->lg_handle;
+	region = dblp->reginfo.primary;
+	target = *lsnp;
+	is_commit = (flags & DB_OPFLAGS_MASK) == DB_COMMIT;
+
+	ret = __log_flush_int(dblp, &target, 1);
+	if (ret == 0)
+		return (0);
+
+	/*
+	 * The flush failed.  Unless it was supporting a transaction commit,
+	 * just hand the failure back to the caller.
+	 */
+	if (!is_commit)
+		return (ret);
+
+	/*
+	 * If the commit record already made it to disk we can't undo it,
+	 * so ignore the failure.
+	 */
+	if (target.file != region->lsn.file || target.offset < region->w_off)
+		return (0);
+
+	/*
+	 * Otherwise we must abort the transaction and make sure the commit
+	 * record never gets out afterwards: write an abort record over it
+	 * in the buffer.  (Other commits in this buffer keep waiting for a
+	 * successful write, we do not wake them.)  Then try to flush the
+	 * buffer again, since the interesting part of it may have reached
+	 * disk before the failure -- we can't know for sure.
+	 */
+	if (__txn_force_abort(dbenv,
+	    dblp->bufp + target.offset - region->w_off) == 0)
+		(void)__log_flush_int(dblp, &target, 0);
+
+	return (ret);
+}
+
+/*
+ * __log_newfile --
+ *	Initialize and switch to a new log file.  (Note that this is
+ * called both when no log yet exists and when we fill a log file.)
+ *
+ * PUBLIC: int __log_newfile __P((DB_LOG *, DB_LSN *));
+ */
+int
+__log_newfile(dblp, lsnp)
+	DB_LOG *dblp;
+	DB_LSN *lsnp;
+{
+	DB_CIPHER *db_cipher;
+	DB_ENV *dbenv;
+	DB_LSN lsn;
+	DBT t;
+	HDR hdr;
+	LOG *lp;
+	int need_free, ret;
+	u_int32_t lastoff;
+	size_t tsize;
+	u_int8_t *tmp;
+
+	dbenv = dblp->dbenv;
+	lp = dblp->reginfo.primary;
+
+	/* If we're not at the beginning of a file already, start a new one. */
+	if (lp->lsn.offset != 0) {
+		/*
+		 * Flush the log so this file is out and can be closed.  We
+		 * cannot release the region lock here because we need to
+		 * protect the end of the file while we switch.  In
+		 * particular, a thread with a smaller record than ours
+		 * could detect that there is space in the log. Even
+		 * blocking that event by declaring the file full would
+		 * require all threads to wait here so that the lsn.file
+		 * can be moved ahead after the flush completes.  This
+		 * probably can be changed if we had an lsn for the
+		 * previous file and one for the current, but it does not
+		 * seem like this would get much more throughput, if any.
+		 */
+		if ((ret = __log_flush_int(dblp, NULL, 0)) != 0)
+			return (ret);
+
+		DB_ASSERT(lp->b_off == 0);
+		/*
 		 * Save the last known offset from the previous file, we'll
 		 * need it to initialize the persistent header information.
 		 */
@@ -143,78 +458,50 @@ __log_put(dbenv, lsn, dbt, flags)
 
 		/* Reset the file write offset. */
 		lp->w_off = 0;
-
-		if (dbenv->db_noticecall != NULL)
-			dbenv->db_noticecall(dbenv, DB_NOTICE_LOGFILE_CHANGED);
 	} else
 		lastoff = 0;
 
-	/* Initialize the LSN information returned to the user. */
-	lsn->file = lp->lsn.file;
-	lsn->offset = lp->lsn.offset;
-
 	/*
 	 * Insert persistent information as the first record in every file.
 	 * Note that the previous length is wrong for the very first record
 	 * of the log, but that's okay, we check for it during retrieval.
 	 */
-	if (lp->lsn.offset == 0) {
-		t.data = &lp->persist;
-		t.size = sizeof(LOGP);
-		if ((ret = __log_putr(dblp, lsn,
-		    &t, lastoff == 0 ? 0 : lastoff - lp->len)) != 0)
-			return (ret);
+	DB_ASSERT(lp->b_off == 0);
 
-		/*
-		 * Record files open in this log.
-		 * If we are recovering then we are in the
-		 * process of outputting the files, don't do
-		 * it again.
-		 */
-		if (!F_ISSET(dblp, DBLOG_RECOVER) &&
-		    (ret = __log_open_files(dbenv)) != 0)
-			return (ret);
-
-		/* Update the LSN information returned to the user. */
-		lsn->file = lp->lsn.file;
-		lsn->offset = lp->lsn.offset;
-	}
+	memset(&t, 0, sizeof(t));
+	memset(&hdr, 0, sizeof(HDR));
 
-	/* Write the application's log record. */
-	if ((ret = __log_putr(dblp, lsn, dbt, lp->lsn.offset - lp->len)) != 0)
+	need_free = 0;
+	tsize = sizeof(LOGP);
+	db_cipher = dbenv->crypto_handle;
+	if (CRYPTO_ON(dbenv))
+		tsize += db_cipher->adj_size(tsize);
+	if ((ret = __os_calloc(dbenv, 1, tsize, &tmp)) != 0)
 		return (ret);
+	lp->persist.log_size = lp->log_size = lp->log_nsize;
+	memcpy(tmp, &lp->persist, sizeof(LOGP));
+	t.data = tmp;
+	t.size = (u_int32_t)tsize;
+	need_free = 1;
 
-	/*
-	 * On a checkpoint, we:
-	 *	Put out the checkpoint record (above).
-	 *	Save the LSN of the checkpoint in the shared region.
-	 *	Append the set of file name information into the log.
-	 */
-	if (flags == DB_CHECKPOINT) {
-		lp->chkpt_lsn = *lsn;
-		if ((ret = __log_open_files(dbenv)) != 0)
-			return (ret);
-	}
+	if ((ret =
+	    __log_encrypt_record(dbenv, &t, &hdr, (u_int32_t)tsize)) != 0)
+		goto err;
+	__db_chksum(t.data, t.size,
+	    (CRYPTO_ON(dbenv)) ? db_cipher->mac_key : NULL, hdr.chksum);
+	lsn = lp->lsn;
+	if ((ret = __log_putr(dblp, &lsn,
+	    &t, lastoff == 0 ? 0 : lastoff - lp->len, &hdr)) != 0)
+		goto err;
 
-	/*
-	 * On a checkpoint or when flush is requested, we:
-	 *	Flush the current buffer contents to disk.
-	 *	Sync the log to disk.
-	 */
-	if (flags == DB_FLUSH || flags == DB_CHECKPOINT)
-		if ((ret = __log_flush(dblp, NULL)) != 0)
-			return (ret);
+	/* Update the LSN information returned to the caller. */
+	if (lsnp != NULL)
+		*lsnp = lp->lsn;
 
-	/*
-	 * On a checkpoint, we:
-	 *	Save the time the checkpoint was written.
-	 *	Reset the bytes written since the last checkpoint.
-	 */
-	if (flags == DB_CHECKPOINT) {
-		(void)time(&lp->chkpt);
-		lp->stat.st_wc_bytes = lp->stat.st_wc_mbytes = 0;
-	}
-	return (0);
+err:
+	if (need_free)
+		__os_free(dbenv, tmp);
+	return (ret);
 }
 
 /*
@@ -222,100 +509,253 @@ __log_put(dbenv, lsn, dbt, flags)
  *	Actually put a record into the log.
  */
 static int
-__log_putr(dblp, lsn, dbt, prev)
+__log_putr(dblp, lsn, dbt, prev, h)
 	DB_LOG *dblp;
 	DB_LSN *lsn;
 	const DBT *dbt;
 	u_int32_t prev;
+	HDR *h;
 {
-	HDR hdr;
+	DB_CIPHER *db_cipher;
+	DB_ENV *dbenv;
+	DB_LSN f_lsn;
 	LOG *lp;
-	int ret;
+	HDR tmp, *hdr;
+	int ret, t_ret;
+	size_t b_off, nr;
+	u_int32_t w_off;
 
+	dbenv = dblp->dbenv;
 	lp = dblp->reginfo.primary;
 
 	/*
+	 * If we weren't given a header, use a local one.
+	 */
+	db_cipher = dbenv->crypto_handle;
+	if (h == NULL) {
+		hdr = &tmp;
+		memset(hdr, 0, sizeof(HDR));
+		if (CRYPTO_ON(dbenv))
+			hdr->size = HDR_CRYPTO_SZ;
+		else
+			hdr->size = HDR_NORMAL_SZ;
+	} else
+		hdr = h;
+
+	/* Save our position in case we fail. */
+	b_off = lp->b_off;
+	w_off = lp->w_off;
+	f_lsn = lp->f_lsn;
+
+	/*
 	 * Initialize the header.  If we just switched files, lsn.offset will
 	 * be 0, and what we really want is the offset of the previous record
 	 * in the previous file.  Fortunately, prev holds the value we want.
 	 */
-	hdr.prev = prev;
-	hdr.len = sizeof(HDR) + dbt->size;
-	hdr.cksum = __ham_func4(NULL, dbt->data, dbt->size);
+	hdr->prev = prev;
+	hdr->len = (u_int32_t)hdr->size + dbt->size;
 
-	if ((ret = __log_fill(dblp, lsn, &hdr, sizeof(HDR))) != 0)
-		return (ret);
-	lp->len = sizeof(HDR);
-	lp->lsn.offset += sizeof(HDR);
+	/*
+	 * If we were passed in a nonzero checksum, our caller calculated
+	 * the checksum before acquiring the log mutex, as an optimization.
+	 *
+	 * If our caller calculated a real checksum of 0, we'll needlessly
+	 * recalculate it.  C'est la vie;  there's no out-of-bounds value
+	 * here.
+	 */
+	if (hdr->chksum[0] == 0)
+		__db_chksum(dbt->data, dbt->size,
+		    (CRYPTO_ON(dbenv)) ? db_cipher->mac_key : NULL,
+		    hdr->chksum);
+
+	if ((ret = __log_fill(dblp, lsn, hdr, (u_int32_t)hdr->size)) != 0)
+		goto err;
 
 	if ((ret = __log_fill(dblp, lsn, dbt->data, dbt->size)) != 0)
-		return (ret);
-	lp->len += dbt->size;
-	lp->lsn.offset += dbt->size;
+		goto err;
+
+	lp->len = (u_int32_t)(hdr->size + dbt->size);
+	lp->lsn.offset += (u_int32_t)(hdr->size + dbt->size);
 	return (0);
+err:
+	/*
+	 * If we wrote more than one buffer before failing, get the
+	 * first one back.  The extra buffers will fail the checksums
+	 * and be ignored.
+	 */
+	if (w_off + lp->buffer_size < lp->w_off) {
+		if ((t_ret =
+		    __os_seek(dbenv,
+		    &dblp->lfh, 0, 0, w_off, 0, DB_OS_SEEK_SET)) != 0 ||
+		    (t_ret = __os_read(dbenv, &dblp->lfh, dblp->bufp,
+		    b_off, &nr)) != 0)
+			return (__db_panic(dbenv, t_ret));
+		if (nr != b_off) {
+			__db_err(dbenv, "Short read while restoring log");
+			return (__db_panic(dbenv, EIO));
+		}
+	}
+
+	/* Reset to where we started. */
+	lp->w_off = w_off;
+	lp->b_off = b_off;
+	lp->f_lsn = f_lsn;
+
+	return (ret);
 }
 
 /*
- * log_flush --
+ * __log_flush --
  *	Write all records less than or equal to the specified LSN.
+ *
+ * PUBLIC: int __log_flush __P((DB_ENV *, const DB_LSN *));
  */
 int
-log_flush(dbenv, lsn)
+__log_flush(dbenv, lsn)
 	DB_ENV *dbenv;
 	const DB_LSN *lsn;
 {
 	DB_LOG *dblp;
 	int ret;
 
-#ifdef HAVE_RPC
-	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
-		return (__dbcl_log_flush(dbenv, lsn));
-#endif
-
 	PANIC_CHECK(dbenv);
-	ENV_REQUIRES_CONFIG(dbenv, dbenv->lg_handle, DB_INIT_LOG);
+	ENV_REQUIRES_CONFIG(dbenv,
+	    dbenv->lg_handle, "DB_ENV->log_flush", DB_INIT_LOG);
 
 	dblp = dbenv->lg_handle;
 	R_LOCK(dbenv, &dblp->reginfo);
-	ret = __log_flush(dblp, lsn);
+	ret = __log_flush_int(dblp, lsn, 1);
 	R_UNLOCK(dbenv, &dblp->reginfo);
 	return (ret);
 }
 
 /*
- * __log_flush --
+ * __log_flush_int --
  *	Write all records less than or equal to the specified LSN; internal
  *	version.
  */
 static int
-__log_flush(dblp, lsn)
+__log_flush_int(dblp, lsnp, release)
 	DB_LOG *dblp;
-	const DB_LSN *lsn;
+	const DB_LSN *lsnp;
+	int release;
 {
-	DB_LSN t_lsn;
+	DB_ENV *dbenv;
+	DB_LSN flush_lsn, f_lsn;
+	DB_MUTEX *flush_mutexp;
 	LOG *lp;
-	int current, ret;
+	int current, do_flush, first, ret;
+	size_t b_off;
+	struct __db_commit *commit;
+	u_int32_t ncommit, w_off;
 
 	ret = 0;
+	ncommit = 0;
+	dbenv = dblp->dbenv;
 	lp = dblp->reginfo.primary;
+	flush_mutexp = R_ADDR(&dblp->reginfo, lp->flush_mutex_off);
 
 	/*
 	 * If no LSN specified, flush the entire log by setting the flush LSN
 	 * to the last LSN written in the log.  Otherwise, check that the LSN
 	 * isn't a non-existent record for the log.
 	 */
-	if (lsn == NULL) {
-		t_lsn.file = lp->lsn.file;
-		t_lsn.offset = lp->lsn.offset - lp->len;
-		lsn = &t_lsn;
-	} else
-		if (lsn->file > lp->lsn.file ||
-		    (lsn->file == lp->lsn.file &&
-		    lsn->offset > lp->lsn.offset - lp->len)) {
-			__db_err(dblp->dbenv,
-			    "log_flush: LSN past current end-of-log");
-			return (EINVAL);
-		}
+	if (lsnp == NULL) {
+		flush_lsn.file = lp->lsn.file;
+		flush_lsn.offset = lp->lsn.offset - lp->len;
+	} else if (lsnp->file > lp->lsn.file ||
+	    (lsnp->file == lp->lsn.file &&
+	    lsnp->offset > lp->lsn.offset - lp->len)) {
+		__db_err(dbenv,
+		    "DB_ENV->log_flush: LSN past current end-of-log");
+		return (EINVAL);
+	} else {
+		/*
+		 * See if we need to wait.  s_lsn is not locked so some
+		 * care is needed.  The sync point can only move forward.
+		 * If the file we want is in the past we are done.
+		 * If the file numbers are the same check the offset.
+		 * If this fails check the file numbers again since the
+		 * offset might have changed while we were looking.
+		 * This all assumes we can read an integer in one
+		 * state or the other, not in transition.
+		 */
+		if (lp->s_lsn.file > lsnp->file)
+			return (0);
+
+		if (lp->s_lsn.file == lsnp->file &&
+		    lp->s_lsn.offset > lsnp->offset)
+			return (0);
+
+		if (lp->s_lsn.file > lsnp->file)
+			return (0);
+
+		flush_lsn = *lsnp;
+	}
+
+	/*
+	 * If a flush is in progress and we're allowed to do so, drop
+	 * the region lock and block waiting for the next flush.
+	 */
+	if (release && lp->in_flush != 0) {
+		if ((commit = SH_TAILQ_FIRST(
+		    &lp->free_commits, __db_commit)) == NULL) {
+			if ((ret =
+			    __db_shalloc(dblp->reginfo.addr,
+			    sizeof(struct __db_commit),
+			    MUTEX_ALIGN, &commit)) != 0)
+				goto flush;
+			memset(commit, 0, sizeof(*commit));
+			if ((ret = __db_mutex_setup(dbenv, &dblp->reginfo,
+			    &commit->mutex, MUTEX_SELF_BLOCK |
+			    MUTEX_NO_RLOCK)) != 0) {
+				__db_shalloc_free(dblp->reginfo.addr, commit);
+				return (ret);
+			}
+			MUTEX_LOCK(dbenv, &commit->mutex);
+		} else
+			SH_TAILQ_REMOVE(
+			    &lp->free_commits, commit, links, __db_commit);
+
+		lp->ncommit++;
+
+		/*
+		 * Flushes may be requested out of LSN order;  be
+		 * sure we only move lp->t_lsn forward.
+		 */
+		if (log_compare(&lp->t_lsn, &flush_lsn) < 0)
+			lp->t_lsn = flush_lsn;
+
+		commit->lsn = flush_lsn;
+		SH_TAILQ_INSERT_HEAD(
+		    &lp->commits, commit, links, __db_commit);
+		R_UNLOCK(dbenv, &dblp->reginfo);
+		/* Wait here for the in-progress flush to finish. */
+		MUTEX_LOCK(dbenv, &commit->mutex);
+		R_LOCK(dbenv, &dblp->reginfo);
+
+		lp->ncommit--;
+		/*
+		 * Grab the flag before freeing the struct to see if
+		 * we need to flush the log to commit.  If so,
+		 * use the maximal lsn for any committing thread.
+		 */
+		do_flush = F_ISSET(commit, DB_COMMIT_FLUSH);
+		F_CLR(commit, DB_COMMIT_FLUSH);
+		SH_TAILQ_INSERT_HEAD(
+		    &lp->free_commits, commit, links, __db_commit);
+		if (do_flush) {
+			lp->in_flush--;
+			flush_lsn = lp->t_lsn;
+		} else
+			return (0);
+	}
+
+	/*
+	 * Protect flushing with its own mutex so we can release
+	 * the region lock except during file switches.
+	 */
+flush:	MUTEX_LOCK(dbenv, flush_mutexp);
 
 	/*
 	 * If the LSN is less than or equal to the last-sync'd LSN, we're done.
@@ -323,9 +763,12 @@ __log_flush(dblp, lsn)
 	 * after the byte we absolutely know was written to disk, so the test
 	 * is <, not <=.
 	 */
-	if (lsn->file < lp->s_lsn.file ||
-	    (lsn->file == lp->s_lsn.file && lsn->offset < lp->s_lsn.offset))
-		return (0);
+	if (flush_lsn.file < lp->s_lsn.file ||
+	    (flush_lsn.file == lp->s_lsn.file &&
+	    flush_lsn.offset < lp->s_lsn.offset)) {
+		MUTEX_UNLOCK(dbenv, flush_mutexp);
+		goto done;
+	}
 
 	/*
 	 * We may need to write the current buffer.  We have to write the
@@ -333,9 +776,12 @@ __log_flush(dblp, lsn)
 	 * buffer's starting LSN.
 	 */
 	current = 0;
-	if (lp->b_off != 0 && log_compare(lsn, &lp->f_lsn) >= 0) {
-		if ((ret = __log_write(dblp, dblp->bufp, lp->b_off)) != 0)
-			return (ret);
+	if (lp->b_off != 0 && log_compare(&flush_lsn, &lp->f_lsn) >= 0) {
+		if ((ret = __log_write(dblp,
+		    dblp->bufp, (u_int32_t)lp->b_off)) != 0) {
+			MUTEX_UNLOCK(dbenv, flush_mutexp);
+			goto done;
+		}
 
 		lp->b_off = 0;
 		current = 1;
@@ -348,23 +794,90 @@ __log_flush(dblp, lsn)
 	 * buffer, don't bother.  We have nothing to write and nothing to
 	 * sync.
 	 */
-	if (dblp->lfname != lp->lsn.file) {
-		if (!current)
-			return (0);
-		if ((ret = __log_newfh(dblp)) != 0)
-			return (ret);
-	}
+	if (!F_ISSET(&dblp->lfh, DB_FH_VALID) || dblp->lfname != lp->lsn.file)
+		if (!current || (ret = __log_newfh(dblp)) != 0) {
+			MUTEX_UNLOCK(dbenv, flush_mutexp);
+			goto done;
+		}
+
+	/*
+	 * We are going to flush, release the region.
+	 * First get the current state of the buffer since
+	 * another write may come in, but we may not flush it.
+	 */
+	b_off = lp->b_off;
+	w_off = lp->w_off;
+	f_lsn = lp->f_lsn;
+	lp->in_flush++;
+	if (release)
+		R_UNLOCK(dbenv, &dblp->reginfo);
 
 	/* Sync all writes to disk. */
-	if ((ret = __os_fsync(dblp->dbenv, &dblp->lfh)) != 0)
-		return (__db_panic(dblp->dbenv, ret));
+	if ((ret = __os_fsync(dbenv, &dblp->lfh)) != 0) {
+		MUTEX_UNLOCK(dbenv, flush_mutexp);
+		if (release)
+			R_LOCK(dbenv, &dblp->reginfo);
+		ret = __db_panic(dbenv, ret);
+		return (ret);
+	}
+
+	/*
+	 * Set the last-synced LSN.
+	 * This value must be set to the LSN past the last complete
+	 * record that has been flushed.  This is at least the first
+	 * lsn, f_lsn.  If the buffer is empty, b_off == 0, then
+	 * we can move up to write point since the first lsn is not
+	 * set for the new buffer.
+	 */
+	lp->s_lsn = f_lsn;
+	if (b_off == 0)
+		lp->s_lsn.offset = w_off;
+
+	MUTEX_UNLOCK(dbenv, flush_mutexp);
+	if (release)
+		R_LOCK(dbenv, &dblp->reginfo);
+
+	lp->in_flush--;
 	++lp->stat.st_scount;
 
-	/* Set the last-synced LSN, using the on-disk write offset. */
-	lp->s_lsn.file = lp->f_lsn.file;
-	lp->s_lsn.offset = lp->w_off;
+	/*
+	 * How many flush calls (usually commits) did this call actually sync?
+	 * At least one, if it got here.
+	 */
+	ncommit = 1;
+done:
+	if (lp->ncommit != 0) {
+		first = 1;
+		for (commit = SH_TAILQ_FIRST(&lp->commits, __db_commit);
+		    commit != NULL;
+		    commit = SH_TAILQ_NEXT(commit, links, __db_commit))
+			if (log_compare(&lp->s_lsn, &commit->lsn) > 0) {
+				MUTEX_UNLOCK(dbenv, &commit->mutex);
+				SH_TAILQ_REMOVE(
+				    &lp->commits, commit, links, __db_commit);
+				ncommit++;
+			} else if (first == 1) {
+				F_SET(commit, DB_COMMIT_FLUSH);
+				MUTEX_UNLOCK(dbenv, &commit->mutex);
+				SH_TAILQ_REMOVE(
+				    &lp->commits, commit, links, __db_commit);
+				/*
+				 * This thread will wake and flush.
+				 * If another thread commits and flushes
+				 * first we will waste a trip through the
+				 * mutex.
+				 */
+				lp->in_flush++;
+				first = 0;
+			}
+	}
+	if (lp->stat.st_maxcommitperflush < ncommit)
+		lp->stat.st_maxcommitperflush = ncommit;
+	if (lp->stat.st_mincommitperflush > ncommit ||
+	    lp->stat.st_mincommitperflush == 0)
+		lp->stat.st_mincommitperflush = ncommit;
 
-	return (0);
+	return (ret);
 }
 
 /*
@@ -415,7 +928,7 @@ __log_fill(dblp, lsn, addr, len)
 		nw = remain > len ? len : remain;
 		memcpy(dblp->bufp + lp->b_off, addr, nw);
 		addr = (u_int8_t *)addr + nw;
-		len -= nw;
+		len -= (u_int32_t)nw;
 		lp->b_off += nw;
 
 		/* If we fill the buffer, flush it. */
@@ -439,15 +952,18 @@ __log_write(dblp, addr, len)
 	void *addr;
 	u_int32_t len;
 {
+	DB_ENV *dbenv;
 	LOG *lp;
 	size_t nw;
 	int ret;
 
+	dbenv = dblp->dbenv;
+	lp = dblp->reginfo.primary;
+
 	/*
 	 * If we haven't opened the log file yet or the current one
 	 * has changed, acquire a new log file.
 	 */
-	lp = dblp->reginfo.primary;
 	if (!F_ISSET(&dblp->lfh, DB_FH_VALID) || dblp->lfname != lp->lsn.file)
 		if ((ret = __log_newfh(dblp)) != 0)
 			return (ret);
@@ -457,14 +973,10 @@ __log_write(dblp, addr, len)
 	 * since we last did).
 	 */
 	if ((ret =
-	    __os_seek(dblp->dbenv,
+	    __os_seek(dbenv,
 	    &dblp->lfh, 0, 0, lp->w_off, 0, DB_OS_SEEK_SET)) != 0 ||
-	    (ret = __os_write(dblp->dbenv, &dblp->lfh, addr, len, &nw)) != 0)
-		return (__db_panic(dblp->dbenv, ret));
-	if (nw != len) {
-		__db_err(dblp->dbenv, "Short write while writing log");
-		return (EIO);
-	}
+	    (ret = __os_write(dbenv, &dblp->lfh, addr, len, &nw)) != 0)
+		return (ret);
 
 	/* Reset the buffer offset and update the seek offset. */
 	lp->w_off += len;
@@ -484,11 +996,13 @@ __log_write(dblp, addr, len)
 }
 
 /*
- * log_file --
+ * __log_file --
  *	Map a DB_LSN to a file name.
+ *
+ * PUBLIC: int __log_file __P((DB_ENV *, const DB_LSN *, char *, size_t));
  */
 int
-log_file(dbenv, lsn, namep, len)
+__log_file(dbenv, lsn, namep, len)
 	DB_ENV *dbenv;
 	const DB_LSN *lsn;
 	char *namep;
@@ -498,13 +1012,9 @@ log_file(dbenv, lsn, namep, len)
 	int ret;
 	char *name;
 
-#ifdef HAVE_RPC
-	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
-		return (__dbcl_log_file(dbenv, lsn, namep, len));
-#endif
-
 	PANIC_CHECK(dbenv);
-	ENV_REQUIRES_CONFIG(dbenv, dbenv->lg_handle, DB_INIT_LOG);
+	ENV_REQUIRES_CONFIG(dbenv,
+	    dbenv->lg_handle, "DB_ENV->log_file", DB_INIT_LOG);
 
 	dblp = dbenv->lg_handle;
 	R_LOCK(dbenv, &dblp->reginfo);
@@ -516,11 +1026,11 @@ log_file(dbenv, lsn, namep, len)
 	/* Check to make sure there's enough room and copy the name. */
 	if (len < strlen(name) + 1) {
 		*namep = '\0';
-		__db_err(dbenv, "log_file: name buffer is too short");
+		__db_err(dbenv, "DB_ENV->log_file: name buffer is too short");
 		return (EINVAL);
 	}
 	(void)strcpy(namep, name);
-	__os_freestr(name);
+	__os_free(dbenv, name);
 
 	return (0);
 }
@@ -533,19 +1043,21 @@ static int
 __log_newfh(dblp)
 	DB_LOG *dblp;
 {
+	DB_ENV *dbenv;
 	LOG *lp;
 	int ret;
 	char *name;
 
+	dbenv = dblp->dbenv;
+	lp = dblp->reginfo.primary;
+
 	/* Close any previous file descriptor. */
 	if (F_ISSET(&dblp->lfh, DB_FH_VALID))
-		(void)__os_closehandle(&dblp->lfh);
-
-	/* Get the path of the new file and open it. */
-	lp = dblp->reginfo.primary;
-	dblp->lfname = lp->lsn.file;
+		(void)__os_closehandle(dbenv, &dblp->lfh);
 
 	/*
+	 * Get the path of the new file and open it.
+	 *
 	 * Adding DB_OSO_LOG to the flags may add additional platform-specific
 	 * optimizations.  On WinNT, the logfile is preallocated, which may
 	 * have a time penalty at startup, but have better overall throughput.
@@ -557,14 +1069,16 @@ __log_newfh(dblp)
 	 * maximum size down into the Windows __os_open routine, because it
 	 * wants to pre-allocate it.
 	 */
-	dblp->lfh.log_size = dblp->dbenv->lg_max;
+	dblp->lfname = lp->lsn.file;
+	dblp->lfh.log_size = lp->log_size;
 	if ((ret = __log_name(dblp, dblp->lfname,
 	    &name, &dblp->lfh,
-	    DB_OSO_CREATE |/* DB_OSO_LOG |*/ DB_OSO_SEQ)) != 0)
-		__db_err(dblp->dbenv,
-		    "log_put: %s: %s", name, db_strerror(ret));
+	    DB_OSO_CREATE |/* DB_OSO_LOG |*/ DB_OSO_SEQ |
+	    (F_ISSET(dbenv, DB_ENV_DIRECT_LOG) ? DB_OSO_DIRECT : 0))) != 0)
+		__db_err(dbenv,
+		    "DB_ENV->log_put: %s: %s", name, db_strerror(ret));
 
-	__os_freestr(name);
+	__os_free(dbenv, name);
 	return (ret);
 }
 
@@ -582,11 +1096,13 @@ __log_name(dblp, filenumber, namep, fhp, flags)
 	char **namep;
 	DB_FH *fhp;
 {
+	DB_ENV *dbenv;
 	LOG *lp;
 	int ret;
 	char *oname;
 	char old[sizeof(LFPREFIX) + 5 + 20], new[sizeof(LFPREFIX) + 10 + 20];
 
+	dbenv = dblp->dbenv;
 	lp = dblp->reginfo.primary;
 
 	/*
@@ -608,13 +1124,12 @@ __log_name(dblp, filenumber, namep, fhp, flags)
 	 * file, return regardless.
 	 */
 	(void)snprintf(new, sizeof(new), LFNAME, filenumber);
-	if ((ret = __db_appname(dblp->dbenv,
-	    DB_APP_LOG, NULL, new, 0, NULL, namep)) != 0 || fhp == NULL)
+	if ((ret = __db_appname(dbenv,
+	    DB_APP_LOG, new, 0, NULL, namep)) != 0 || fhp == NULL)
 		return (ret);
 
 	/* Open the new-style file -- if we succeed, we're done. */
-	if ((ret = __os_open(dblp->dbenv,
-	    *namep, flags, lp->persist.mode, fhp)) == 0)
+	if ((ret = __os_open(dbenv, *namep, flags, lp->persist.mode, fhp)) == 0)
 		return (0);
 
 	/*
@@ -622,15 +1137,14 @@ __log_name(dblp, filenumber, namep, fhp, flags)
 	 * the caller isn't interested in old-style files.
 	 */
 	if (!LF_ISSET(DB_OSO_RDONLY)) {
-		__db_err(dblp->dbenv,
+		__db_err(dbenv,
 		    "%s: log file open failed: %s", *namep, db_strerror(ret));
-		return (__db_panic(dblp->dbenv, ret));
+		return (__db_panic(dbenv, ret));
 	}
 
 	/* Create an old-style file name. */
 	(void)snprintf(old, sizeof(old), LFNAME_V1, filenumber);
-	if ((ret = __db_appname(dblp->dbenv,
-	    DB_APP_LOG, NULL, old, 0, NULL, &oname)) != 0)
+	if ((ret = __db_appname(dbenv, DB_APP_LOG, old, 0, NULL, &oname)) != 0)
 		goto err;
 
 	/*
@@ -638,9 +1152,9 @@ __log_name(dblp, filenumber, namep, fhp, flags)
 	 * space allocated for the new-style name and return the old-style
 	 * name to the caller.
 	 */
-	if ((ret = __os_open(dblp->dbenv,
+	if ((ret = __os_open(dbenv,
 	    oname, flags, lp->persist.mode, fhp)) == 0) {
-		__os_freestr(*namep);
+		__os_free(dbenv, *namep);
 		*namep = oname;
 		return (0);
 	}
@@ -653,52 +1167,82 @@ __log_name(dblp, filenumber, namep, fhp, flags)
 	 * old-style name, but we expected it to exist and we weren't just
 	 * looking for any log file.  That's not a likely error.
 	 */
-err:	__os_freestr(oname);
+err:	__os_free(dbenv, oname);
 	return (ret);
 }
 
-static int
-__log_open_files(dbenv)
+/*
+ * __log_rep_put --
+ *	Short-circuit way for replication clients to put records into the
+ * log.  Replication clients' logs need to be laid out exactly as their
+ * masters' are, so we let replication take responsibility for when the log
+ * gets flushed, when log switches files, etc.  This is just a thin PUBLIC
+ * wrapper for __log_putr with a slightly prettier interface.
+ *
+ * Note that the log region mutex should be held when this is called.
+ *
+ * PUBLIC: int __log_rep_put __P((DB_ENV *, DB_LSN *, const DBT *));
+ */
+int
+__log_rep_put(dbenv, lsnp, rec)
 	DB_ENV *dbenv;
+	DB_LSN *lsnp;
+	const DBT *rec;
 {
+	DB_CIPHER *db_cipher;
 	DB_LOG *dblp;
-	DB_LSN r_unused;
-	DBT fid_dbt, t;
-	FNAME *fnp;
+	HDR hdr;
+	DBT *dbt, t;
 	LOG *lp;
-	int ret;
+	int need_free, ret;
 
 	dblp = dbenv->lg_handle;
 	lp = dblp->reginfo.primary;
 
-	for (fnp = SH_TAILQ_FIRST(&lp->fq, __fname);
-	    fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname)) {
-		if (fnp->ref == 0)	/* Entry not in use. */
-			continue;
-		if (fnp->name_off != INVALID_ROFF) {
-			memset(&t, 0, sizeof(t));
-			t.data = R_ADDR(&dblp->reginfo, fnp->name_off);
-			t.size = strlen(t.data) + 1;
-		}
-		memset(&fid_dbt, 0, sizeof(fid_dbt));
-		fid_dbt.data = fnp->ufid;
-		fid_dbt.size = DB_FILE_ID_LEN;
-		/*
-		 * Output LOG_CHECKPOINT records which will be
-		 * processed during the OPENFILES pass of recovery.
-		 * At the end of recovery we want to output the
-		 * files that were open so that a future recovery
-		 * run will have the correct files open during
-		 * a backward pass.  For this we output LOG_CLOSE
-		 * records so that the files will be closed on
-		 * the forward pass.
-		 */
-		if ((ret = __log_register_log(dbenv,
-		    NULL, &r_unused, 0,
-		    F_ISSET(dblp, DBLOG_RECOVER) ? LOG_CLOSE : LOG_CHECKPOINT,
-		    fnp->name_off == INVALID_ROFF ? NULL : &t,
-		    &fid_dbt, fnp->id, fnp->s_type, fnp->meta_pgno)) != 0)
+	memset(&hdr, 0, sizeof(HDR));
+	t = *rec;
+	dbt = &t;
+	need_free = 0;
+	db_cipher = (DB_CIPHER *)dbenv->crypto_handle;
+	if (CRYPTO_ON(dbenv))
+		t.size += db_cipher->adj_size(rec->size);
+	if ((ret = __os_calloc(dbenv, 1, t.size, &t.data)) != 0)
+		goto err;
+	need_free = 1;
+	memcpy(t.data, rec->data, rec->size);
+
+	if ((ret = __log_encrypt_record(dbenv, dbt, &hdr, rec->size)) != 0)
+		goto err;
+	__db_chksum(t.data, t.size,
+	    (CRYPTO_ON(dbenv)) ? db_cipher->mac_key : NULL, hdr.chksum);
+
+	DB_ASSERT(log_compare(lsnp, &lp->lsn) == 0);
+	ret = __log_putr(dblp, lsnp, dbt, lp->lsn.offset - lp->len, &hdr);
+err:
+	if (need_free)
+		__os_free(dbenv, t.data);
+	return (ret);
+}
+
+static int
+__log_encrypt_record(dbenv, dbt, hdr, orig)
+	DB_ENV *dbenv;
+	DBT *dbt;
+	HDR *hdr;
+	u_int32_t orig;
+{
+	DB_CIPHER *db_cipher;
+	int ret;
+
+	if (CRYPTO_ON(dbenv)) {
+		db_cipher = (DB_CIPHER *)dbenv->crypto_handle;
+		hdr->size = HDR_CRYPTO_SZ;
+		hdr->orig_size = orig;
+		if ((ret = db_cipher->encrypt(dbenv, db_cipher->data,
+		    hdr->iv, dbt->data, dbt->size)) != 0)
 			return (ret);
+	} else {
+		hdr->size = HDR_NORMAL_SZ;
 	}
 	return (0);
 }
diff --git a/bdb/log/log_rec.c b/bdb/log/log_rec.c
deleted file mode 100644
index 493dd06d4c6..00000000000
--- a/bdb/log/log_rec.c
+++ /dev/null
@@ -1,647 +0,0 @@
-/*-
- * See the file LICENSE for redistribution information.
- *
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
- *	Sleepycat Software.  All rights reserved.
- */
-/*
- * Copyright (c) 1995, 1996
- *	The President and Fellows of Harvard University.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include "db_config.h"
-
-#ifndef lint
-static const char revid[] = "$Id: log_rec.c,v 11.48 2001/01/11 18:19:53 bostic Exp $";
-#endif /* not lint */
-
-#ifndef NO_SYSTEM_INCLUDES
-#include <sys/types.h>
-
-#include <string.h>
-#endif
-
-#include "db_int.h"
-#include "db_page.h"
-#include "db_am.h"
-#include "log.h"
-
-static int __log_check_master __P((DB_ENV *, u_int8_t *, char *));
-static int __log_do_open __P((DB_ENV *, DB_LOG *,
-    u_int8_t *, char *, DBTYPE, int32_t, db_pgno_t));
-static int __log_open_file __P((DB_ENV *, DB_LOG *, __log_register_args *));
-
-/*
- * PUBLIC: int __log_register_recover
- * PUBLIC:     __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
- */
-int
-__log_register_recover(dbenv, dbtp, lsnp, op, info)
-	DB_ENV *dbenv;
-	DBT *dbtp;
-	DB_LSN *lsnp;
-	db_recops op;
-	void *info;
-{
-	DB_ENTRY *dbe;
-	DB_LOG *logp;
-	DB *dbp;
-	__log_register_args *argp;
-	int do_rem, ret, t_ret;
-
-	logp = dbenv->lg_handle;
-	dbp = NULL;
-
-#ifdef DEBUG_RECOVER
-	REC_PRINT(__log_register_print);
-#endif
-	COMPQUIET(lsnp, NULL);
-
-	if ((ret = __log_register_read(dbenv, dbtp->data, &argp)) != 0)
-		goto out;
-
-	if ((argp->opcode == LOG_OPEN &&
-	    (DB_REDO(op) || op == DB_TXN_OPENFILES)) ||
-	    (argp->opcode == LOG_CLOSE && DB_UNDO(op))) {
-		/*
-		 * If we are redoing an open or undoing a close, then we need
-		 * to open a file.  We must open the file even if
-		 * the meta page is not yet written as we may be creating it.
-		 */
-		if (op == DB_TXN_OPENFILES)
-			F_SET(logp, DBLOG_FORCE_OPEN);
-		ret = __log_open_file(dbenv, logp, argp);
-		F_CLR(logp, DBLOG_FORCE_OPEN);
-		if (ret == ENOENT || ret == EINVAL) {
-			if (op == DB_TXN_OPENFILES && argp->name.size != 0 &&
-			    (ret = __db_txnlist_delete(dbenv, info,
-				argp->name.data, argp->fileid, 0)) != 0)
-				goto out;
-			ret = 0;
-		}
-	} else if (argp->opcode != LOG_CHECKPOINT) {
-		/*
-		 * If we are undoing an open, then we need to close the file.
-		 *
-		 * If the file is deleted, then we can just ignore this close.
-		 * Otherwise, we should usually have a valid dbp we should
-		 * close or whose reference count should be decremented.
-		 * However, if we shut down without closing a file, we may, in
-		 * fact, not have the file open, and that's OK.
-		 */
-		do_rem = 0;
-		MUTEX_THREAD_LOCK(dbenv, logp->mutexp);
-		if (argp->fileid < logp->dbentry_cnt) {
-			dbe = &logp->dbentry[argp->fileid];
-
-			DB_ASSERT(dbe->refcount == 1);
-
-			ret = __db_txnlist_close(info,
-			    argp->fileid, dbe->count);
-			if ((dbp = TAILQ_FIRST(&dbe->dblist)) != NULL)
-				(void)log_unregister(dbenv, dbp);
-			do_rem = 1;
-		}
-		MUTEX_THREAD_UNLOCK(dbenv, logp->mutexp);
-		if (do_rem) {
-			(void)__log_rem_logid(logp, dbp, argp->fileid);
-			/*
-			 * If remove or rename has closed the file, don't
-			 * sync.
-			 */
-			if (dbp != NULL &&
-			    (t_ret = dbp->close(dbp,
-			    dbp->mpf == NULL ? DB_NOSYNC : 0)) != 0 && ret == 0)
-				ret = t_ret;
-		}
-	} else if (DB_UNDO(op) || op == DB_TXN_OPENFILES) {
-		/*
-		 * It's a checkpoint and we are rolling backward.  It
-		 * is possible that the system was shut down and thus
-		 * ended with a stable checkpoint; this file was never
-		 * closed and has therefore not been reopened yet.  If
-		 * so, we need to try to open it.
-		 */
-		ret = __log_open_file(dbenv, logp, argp);
-		if (ret == ENOENT || ret == EINVAL) {
-			if (argp->name.size != 0 && (ret =
-			    __db_txnlist_delete(dbenv, info,
-				argp->name.data, argp->fileid, 0)) != 0)
-				goto out;
-			ret = 0;
-		}
-	}
-
-out:	if (argp != NULL)
-		__os_free(argp, 0);
-	return (ret);
-}
-
-/*
- * __log_open_file --
- *	Called during log_register recovery.  Make sure that we have an
- *	entry in the dbentry table for this ndx.  Returns 0 on success,
- *	non-zero on error.
- */
-static int
-__log_open_file(dbenv, lp, argp)
-	DB_ENV *dbenv;
-	DB_LOG *lp;
-	__log_register_args *argp;
-{
-	DB_ENTRY *dbe;
-	DB *dbp;
-
-	/*
-	 * We never re-open temporary files.  Temp files are only
-	 * useful during aborts in which case the dbp was entered
-	 * when the file was registered.  During recovery, we treat
-	 * temp files as properly deleted files, allowing the open to
-	 * fail and not reporting any errors when recovery fails to
-	 * get a valid dbp from db_fileid_to_db.
-	 */
-	if (argp->name.size == 0) {
-		(void)__log_add_logid(dbenv, lp, NULL, argp->fileid);
-		return (ENOENT);
-	}
-
-	/*
-	 * Because of reference counting, we cannot automatically close files
-	 * during recovery, so when we're opening, we have to check that the
-	 * name we are opening is what we expect.  If it's not, then we close
-	 * the old file and open the new one.
-	 */
-	MUTEX_THREAD_LOCK(dbenv, lp->mutexp);
-	if (argp->fileid < lp->dbentry_cnt)
-		dbe = &lp->dbentry[argp->fileid];
-	else
-		dbe = NULL;
-
-	if (dbe != NULL) {
-		dbe->deleted = 0;
-		if ((dbp = TAILQ_FIRST(&dbe->dblist)) != NULL) {
-			if (dbp->meta_pgno != argp->meta_pgno ||
-			    memcmp(dbp->fileid,
-			    argp->uid.data, DB_FILE_ID_LEN) != 0) {
-				MUTEX_THREAD_UNLOCK(dbenv, lp->mutexp);
-				goto reopen;
-			}
-			if (!F_ISSET(lp, DBLOG_RECOVER))
-				dbe->refcount++;
-			MUTEX_THREAD_UNLOCK(dbenv, lp->mutexp);
-			return (0);
-		}
-	}
-
-	MUTEX_THREAD_UNLOCK(dbenv, lp->mutexp);
-	if (0) {
-reopen:		(void)log_unregister(dbp->dbenv, dbp);
-		(void)__log_rem_logid(lp, dbp, argp->fileid);
-		dbp->close(dbp, 0);
-	}
-
-	return (__log_do_open(dbenv, lp,
-	    argp->uid.data, argp->name.data,
-	    argp->ftype, argp->fileid, argp->meta_pgno));
-}
-
-/*
- * log_reopen_file -- close and reopen a db file.
- *	Must be called when a metadata page changes.
- *
- * PUBLIC: int __log_reopen_file __P((DB_ENV *,
- * PUBLIC:     char *, int32_t, u_int8_t *, db_pgno_t));
- *
- */
-int
-__log_reopen_file(dbenv, name, ndx, fileid, meta_pgno)
-	DB_ENV *dbenv;
-	char *name;
-	int32_t ndx;
-	u_int8_t *fileid;
-	db_pgno_t meta_pgno;
-{
-	DB *dbp;
-	DB_LOG *logp;
-	DBTYPE ftype;
-	FNAME *fnp;
-	LOG *lp;
-	char *tmp_name;
-	int ret;
-
-	logp = dbenv->lg_handle;
-
-	if (name == NULL) {
-		R_LOCK(dbenv, &logp->reginfo);
-
-		lp = logp->reginfo.primary;
-
-		for (fnp = SH_TAILQ_FIRST(&lp->fq, __fname);
-		    fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname)) {
-			if (fnp->ref == 0)      /* Entry not in use. */
-				continue;
-			if (memcmp(fnp->ufid, fileid, DB_FILE_ID_LEN) == 0)
-				break;
-		}
-
-		if (fnp == 0 || fnp->name_off == INVALID_ROFF) {
-			__db_err(dbenv,
-			    "metasub recover: non-existent file id");
-			return (EINVAL);
-		}
-
-		name = R_ADDR(&logp->reginfo, fnp->name_off);
-		ret = __os_strdup(dbenv, name, &tmp_name);
-		R_UNLOCK(dbenv, &logp->reginfo);
-		if (ret != 0)
-			goto out;
-		name = tmp_name;
-	} else
-		tmp_name = NULL;
-
-	if ((ret = __db_fileid_to_db(dbenv, &dbp, ndx, 0)) != 0)
-		goto out;
-	ftype = dbp->type;
-	(void)log_unregister(dbenv, dbp);
-	(void)__log_rem_logid(logp, dbp, ndx);
-	(void)dbp->close(dbp, 0);
-
-	ret = __log_do_open(dbenv, logp, fileid, name, ftype, ndx, meta_pgno);
-
-	if (tmp_name != NULL)
-		__os_free(tmp_name, 0);
-
-out:	return (ret);
-}
-
-/*
- * __log_do_open --
- *	Open files referenced in the log.  This is the part of the open that
- * is not protected by the thread mutex.
- */
-static int
-__log_do_open(dbenv, lp, uid, name, ftype, ndx, meta_pgno)
-	DB_ENV *dbenv;
-	DB_LOG *lp;
-	u_int8_t *uid;
-	char *name;
-	DBTYPE ftype;
-	int32_t ndx;
-	db_pgno_t meta_pgno;
-{
-	DB *dbp;
-	int ret;
-	u_int8_t zeroid[DB_FILE_ID_LEN];
-
-	if ((ret = db_create(&dbp, lp->dbenv, 0)) != 0)
-		return (ret);
-
-	dbp->log_fileid = ndx;
-
-	/*
-	 * This is needed to signal to the locking routines called while
-	 * opening databases that we are potentially undoing a transaction
-	 * from an XA process.  Since the XA process does not share
-	 * locks with the aborting transaction this prevents us from
-	 * deadlocking during the open during rollback.
-	 * Because this routine is called either during recovery or during an
-	 * XA_ABORT, we can safely set DB_AM_RECOVER in the dbp since it
-	 * will not be shared with other threads.
-	 */
-	F_SET(dbp, DB_AM_RECOVER);
-	if (meta_pgno != PGNO_BASE_MD)
-		memcpy(dbp->fileid, uid, DB_FILE_ID_LEN);
-	dbp->type = ftype;
-	if ((ret =
-	    __db_dbopen(dbp, name, 0, __db_omode("rw----"), meta_pgno)) == 0) {
-		/*
-		 * Verify that we are opening the same file that we were
-		 * referring to when we wrote this log record.
-		 */
-		if (meta_pgno != PGNO_BASE_MD &&
-		    __log_check_master(dbenv, uid, name) != 0)
-			goto not_right;
-		if (memcmp(uid, dbp->fileid, DB_FILE_ID_LEN) != 0) {
-			memset(zeroid, 0, DB_FILE_ID_LEN);
-			if (memcmp(dbp->fileid, zeroid, DB_FILE_ID_LEN) != 0)
-				goto not_right;
-			memcpy(dbp->fileid, uid, DB_FILE_ID_LEN);
-		}
-		if (IS_RECOVERING(dbenv)) {
-			(void)log_register(dbp->dbenv, dbp, name);
-			(void)__log_add_logid(dbenv, lp, dbp, ndx);
-		}
-		return (0);
-	}
-
-not_right:
-	(void)dbp->close(dbp, 0);
-	(void)__log_add_logid(dbenv, lp, NULL, ndx);
-
-	return (ENOENT);
-}
-
-static int
-__log_check_master(dbenv, uid, name)
-	DB_ENV *dbenv;
-	u_int8_t *uid;
-	char *name;
-{
-	DB *dbp;
-	int ret;
-
-	ret = 0;
-	if ((ret = db_create(&dbp, dbenv, 0)) != 0)
-		return (ret);
-	dbp->type = DB_BTREE;
-	ret = __db_dbopen(dbp, name, 0, __db_omode("rw----"), PGNO_BASE_MD);
-
-	if (ret == 0 && memcmp(uid, dbp->fileid, DB_FILE_ID_LEN) != 0) 
-		ret = EINVAL;
-	
-	(void) dbp->close(dbp, 0);
-	return (ret);
-}
-
-/*
- * __log_add_logid --
- *	Adds a DB entry to the log's DB entry table.
- *
- * PUBLIC: int __log_add_logid __P((DB_ENV *, DB_LOG *, DB *, int32_t));
- */
-int
-__log_add_logid(dbenv, logp, dbp, ndx)
-	DB_ENV *dbenv;
-	DB_LOG *logp;
-	DB *dbp;
-	int32_t ndx;
-{
-	DB *dbtmp;
-	int32_t i;
-	int ret;
-
-	ret = 0;
-
-	MUTEX_THREAD_LOCK(dbenv, logp->mutexp);
-
-	/*
-	 * Check if we need to grow the table.  Note, ndx is 0-based (the
-	 * index into the DB entry table) an dbentry_cnt is 1-based, the
-	 * number of available slots.
-	 */
-	if (logp->dbentry_cnt <= ndx) {
-		if ((ret = __os_realloc(dbenv,
-		    (ndx + DB_GROW_SIZE) * sizeof(DB_ENTRY),
-		    NULL, &logp->dbentry)) != 0)
-			goto err;
-
-		/*
-		 * We have moved the head of the queue.
-		 * Fix up the queue header of an empty queue or the previous
-		 * pointer of the first element.
-		 */
-		for (i = 0; i < logp->dbentry_cnt; i++) {
-			if ((dbtmp =
-			    TAILQ_FIRST(&logp->dbentry[i].dblist)) == NULL)
-				TAILQ_INIT(&logp->dbentry[i].dblist);
-			else
-				TAILQ_REINSERT_HEAD(
-				    &logp->dbentry[i].dblist, dbtmp, links);
-		}
-
-		/* Initialize the new entries. */
-		for (i = logp->dbentry_cnt; i < ndx + DB_GROW_SIZE; i++) {
-			logp->dbentry[i].count = 0;
-			TAILQ_INIT(&logp->dbentry[i].dblist);
-			logp->dbentry[i].deleted = 0;
-			logp->dbentry[i].refcount = 0;
-		}
-
-		logp->dbentry_cnt = i;
-	}
-
-	if (logp->dbentry[ndx].deleted == 0 &&
-	    TAILQ_FIRST(&logp->dbentry[ndx].dblist) == NULL) {
-		logp->dbentry[ndx].count = 0;
-		if (dbp != NULL)
-			TAILQ_INSERT_HEAD(&logp->dbentry[ndx].dblist,
-			    dbp, links);
-		logp->dbentry[ndx].deleted = dbp == NULL;
-		logp->dbentry[ndx].refcount = 1;
-	} else if (!F_ISSET(logp, DBLOG_RECOVER)) {
-		if (dbp != NULL)
-			TAILQ_INSERT_HEAD(&logp->dbentry[ndx].dblist,
-			    dbp, links);
-		logp->dbentry[ndx].refcount++;
-	}
-
-err:	MUTEX_THREAD_UNLOCK(dbenv, logp->mutexp);
-	return (ret);
-}
-
-/*
- * __db_fileid_to_db --
- *	Return the DB corresponding to the specified fileid.
- *
- * PUBLIC: int __db_fileid_to_db __P((DB_ENV *, DB **, int32_t, int));
- */
-int
-__db_fileid_to_db(dbenv, dbpp, ndx, inc)
-	DB_ENV *dbenv;
-	DB **dbpp;
-	int32_t ndx;
-	int inc;
-{
-	DB_LOG *logp;
-	DB *dbp;
-	FNAME *fname;
-	int ret;
-	char *name;
-
-	ret = 0;
-	logp = dbenv->lg_handle;
-
-	MUTEX_THREAD_LOCK(dbenv, logp->mutexp);
-
-	/*
-	 * Under XA, a process different than the one issuing DB operations
-	 * may abort a transaction.  In this case, recovery routines are run
-	 * by a process that does not necessarily have the file open, so we
-	 * we must open the file explicitly.
-	 */
-	if (ndx >= logp->dbentry_cnt ||
-	    (!logp->dbentry[ndx].deleted &&
-	    (dbp = TAILQ_FIRST(&logp->dbentry[ndx].dblist)) == NULL)) {
-		if (F_ISSET(logp, DBLOG_RECOVER)) {
-			ret = ENOENT;
-			goto err;
-		}
-		if (__log_lid_to_fname(logp, ndx, &fname) != 0) {
-			/* Couldn't find entry; this is a fatal error. */
-			__db_err(dbenv, "Missing log fileid entry");
-			ret = EINVAL;
-			goto err;
-		}
-		name = R_ADDR(&logp->reginfo, fname->name_off);
-
-		/*
-		 * __log_do_open is called without protection of the
-		 * log thread lock.
-		 */
-		MUTEX_THREAD_UNLOCK(dbenv, logp->mutexp);
-
-		/*
-		 * At this point, we are not holding the thread lock, so exit
-		 * directly instead of going through the exit code at the
-		 * bottom.  If the __log_do_open succeeded, then we don't need
-		 * to do any of the remaining error checking at the end of this
-		 * routine.
-		 */
-		if ((ret = __log_do_open(dbenv, logp,
-		    fname->ufid, name, fname->s_type,
-		    ndx, fname->meta_pgno)) != 0)
-			return (ret);
-
-		*dbpp = TAILQ_FIRST(&logp->dbentry[ndx].dblist);
-		return (0);
-	}
-
-	/*
-	 * Return DB_DELETED if the file has been deleted (it's not an error).
-	 */
-	if (logp->dbentry[ndx].deleted) {
-		ret = DB_DELETED;
-		if (inc)
-			logp->dbentry[ndx].count++;
-		goto err;
-	}
-
-	/*
-	 * Otherwise return 0, but if we don't have a corresponding DB, it's
-	 * an error.
-	 */
-	if ((*dbpp = TAILQ_FIRST(&logp->dbentry[ndx].dblist)) == NULL)
-		ret = ENOENT;
-
-err:	MUTEX_THREAD_UNLOCK(dbenv, logp->mutexp);
-	return (ret);
-}
-
-/*
- * __log_close_files --
- *	Close files that were opened by the recovery daemon.  We sync the
- *	file, unless its mpf pointer has been NULLed by a db_remove or
- *	db_rename.  We may not have flushed the log_register record that
- *	closes the file.
- *
- * PUBLIC: void __log_close_files __P((DB_ENV *));
- */
-void
-__log_close_files(dbenv)
-	DB_ENV *dbenv;
-{
-	DB_ENTRY *dbe;
-	DB_LOG *logp;
-	DB *dbp;
-	int32_t i;
-
-	logp = dbenv->lg_handle;
-	MUTEX_THREAD_LOCK(dbenv, logp->mutexp);
-	for (i = 0; i < logp->dbentry_cnt; i++) {
-		dbe = &logp->dbentry[i];
-		while ((dbp = TAILQ_FIRST(&dbe->dblist)) != NULL) {
-			(void)log_unregister(dbenv, dbp);
-			TAILQ_REMOVE(&dbe->dblist, dbp, links);
-			(void)dbp->close(dbp, dbp->mpf == NULL ? DB_NOSYNC : 0);
-		}
-		dbe->deleted = 0;
-		dbe->refcount = 0;
-	}
-	MUTEX_THREAD_UNLOCK(dbenv, logp->mutexp);
-}
-
-/*
- * __log_rem_logid
- *	Remove an entry from the log table.  Find the appropriate DB and
- * unlink it from the linked list off the table.  If the DB is NULL, treat
- * this as a simple refcount decrement.
- *
- * PUBLIC: void __log_rem_logid __P((DB_LOG *, DB *, int32_t));
- */
-void
-__log_rem_logid(logp, dbp, ndx)
-	DB_LOG *logp;
-	DB *dbp;
-	int32_t ndx;
-{
-	DB *xdbp;
-
-	MUTEX_THREAD_LOCK(logp->dbenv, logp->mutexp);
-	if (--logp->dbentry[ndx].refcount == 0) {
-		TAILQ_INIT(&logp->dbentry[ndx].dblist);
-		logp->dbentry[ndx].deleted = 0;
-	} else if (dbp != NULL)
-		for (xdbp = TAILQ_FIRST(&logp->dbentry[ndx].dblist);
-		    xdbp != NULL;
-		    xdbp = TAILQ_NEXT(xdbp, links))
-			if (xdbp == dbp) {
-				TAILQ_REMOVE(&logp->dbentry[ndx].dblist,
-				    xdbp, links);
-				break;
-			}
-
-	MUTEX_THREAD_UNLOCK(logp->dbenv, logp->mutexp);
-}
-
-/*
- * __log_lid_to_fname --
- *	Traverse the shared-memory region looking for the entry that
- *	matches the passed log fileid.  Returns 0 on success; -1 on error.
- * PUBLIC: int __log_lid_to_fname __P((DB_LOG *, int32_t, FNAME **));
- */
-int
-__log_lid_to_fname(dblp, lid, fnamep)
-	DB_LOG *dblp;
-	int32_t lid;
-	FNAME **fnamep;
-{
-	FNAME *fnp;
-	LOG *lp;
-
-	lp = dblp->reginfo.primary;
-
-	for (fnp = SH_TAILQ_FIRST(&lp->fq, __fname);
-	    fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname)) {
-		if (fnp->ref == 0)	/* Entry not in use. */
-			continue;
-		if (fnp->id == lid) {
-			*fnamep = fnp;
-			return (0);
-		}
-	}
-	return (-1);
-}
diff --git a/bdb/log/log_register.c b/bdb/log/log_register.c
deleted file mode 100644
index 1e0e523d8b9..00000000000
--- a/bdb/log/log_register.c
+++ /dev/null
@@ -1,433 +0,0 @@
-/*-
- * See the file LICENSE for redistribution information.
- *
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
- *	Sleepycat Software.  All rights reserved.
- */
-#include "db_config.h"
-
-#ifndef lint
-static const char revid[] = "$Id: log_register.c,v 11.35 2001/01/10 16:04:19 bostic Exp $";
-#endif /* not lint */
-
-#ifndef NO_SYSTEM_INCLUDES
-#include <sys/types.h>
-
-#include <string.h>
-#endif
-
-#ifdef  HAVE_RPC
-#include "db_server.h"
-#endif
-
-#include "db_int.h"
-#include "log.h"
-
-#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
-#endif
-
-/*
- * log_register --
- *	Register a file name.
- */
-int
-log_register(dbenv, dbp, name)
-	DB_ENV *dbenv;
-	DB *dbp;
-	const char *name;
-{
-	DBT fid_dbt, r_name;
-	DB_LOG *dblp;
-	DB_LSN r_unused;
-	FNAME *found_fnp, *fnp, *recover_fnp, *reuse_fnp;
-	LOG *lp;
-	size_t len;
-	int32_t maxid;
-	int inserted, ok, ret;
-	void *namep;
-
-#ifdef HAVE_RPC
-	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
-		return (__dbcl_log_register(dbenv, dbp, name));
-#endif
-
-	PANIC_CHECK(dbenv);
-	ENV_REQUIRES_CONFIG(dbenv, dbenv->lg_handle, DB_INIT_LOG);
-
-	dblp = dbenv->lg_handle;
-	lp = dblp->reginfo.primary;
-	fnp = reuse_fnp = NULL;
-	inserted = ret = 0;
-	namep = NULL;
-
-	/* Check the arguments. */
-	if (dbp->type != DB_BTREE && dbp->type != DB_QUEUE &&
-	    dbp->type != DB_HASH && dbp->type != DB_RECNO) {
-		__db_err(dbenv, "log_register: unknown DB file type");
-		return (EINVAL);
-	}
-
-	R_LOCK(dbenv, &dblp->reginfo);
-
-	/*
-	 * See if we've already got this file in the log, finding the
-	 * (maximum+1) in-use file id and some available file id (if we
-	 * find an available fid, we'll use it, else we'll have to allocate
-	 * one after the maximum that we found).
-	 */
-	ok = 0;
-	found_fnp = recover_fnp = NULL;
-	for (maxid = 0, fnp = SH_TAILQ_FIRST(&lp->fq, __fname);
-	    fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname)) {
-		if (F_ISSET(dblp, DBLOG_RECOVER) && fnp->id == dbp->log_fileid)
-			recover_fnp = fnp;
-		if (fnp->ref == 0) {		/* Entry is not in use. */
-			if (reuse_fnp == NULL)
-				reuse_fnp = fnp;
-			continue;
-		}
-		if (memcmp(dbp->fileid, fnp->ufid, DB_FILE_ID_LEN) == 0) {
-			if (fnp->meta_pgno == 0) {
-				if (fnp->locked == 1) {
-					__db_err(dbenv, "File is locked");
-					return (EINVAL);
-				}
-				if (found_fnp != NULL) {
-					fnp = found_fnp;
-					goto found;
-				}
-				ok = 1;
-			}
-			if (dbp->meta_pgno == fnp->meta_pgno) {
-				if (F_ISSET(dblp, DBLOG_RECOVER)) {
-					if (fnp->id != dbp->log_fileid) {
-						/*
-						 * If we are in recovery, there
-						 * is only one dbp on the list.
-						 * If the refcount goes to 0,
-						 * we will clear the list.  If
-						 * it doesn't, we want to leave
-						 * the dbp where it is, so
-						 * passing a NULL to rem_logid
-						 * is correct.
-						 */
-						__log_rem_logid(dblp,
-						    NULL, fnp->id);
-						if (recover_fnp != NULL)
-							break;
-						continue;
-					}
-					fnp->ref = 1;
-					goto found;
-				}
-				++fnp->ref;
-				if (ok)
-					goto found;
-				found_fnp = fnp;
-			}
-		}
-		if (maxid <= fnp->id)
-			maxid = fnp->id + 1;
-	}
-	if ((fnp = found_fnp) != NULL)
-		goto found;
-
-	/* Fill in fnp structure. */
-	if (recover_fnp != NULL)	/* This has the right number */
-		fnp = recover_fnp;
-	else if (reuse_fnp != NULL)	/* Reuse existing one. */
-		fnp = reuse_fnp;
-	else {				/* Allocate a new one. */
-		if ((ret = __db_shalloc(dblp->reginfo.addr,
-		    sizeof(FNAME), 0, &fnp)) != 0)
-			goto mem_err;
-		fnp->id = maxid;
-	}
-
-	if (F_ISSET(dblp, DBLOG_RECOVER))
-		fnp->id = dbp->log_fileid;
-
-	fnp->ref = 1;
-	fnp->locked = 0;
-	fnp->s_type = dbp->type;
-	memcpy(fnp->ufid, dbp->fileid, DB_FILE_ID_LEN);
-	fnp->meta_pgno = dbp->meta_pgno;
-
-	if (name != NULL) {
-		len = strlen(name) + 1;
-		if ((ret =
-		    __db_shalloc(dblp->reginfo.addr, len, 0, &namep)) != 0) {
-mem_err:		__db_err(dbenv,
-			    "Unable to allocate memory to register %s", name);
-			goto err;
-	}
-		fnp->name_off = R_OFFSET(&dblp->reginfo, namep);
-		memcpy(namep, name, len);
-	} else
-		fnp->name_off = INVALID_ROFF;
-
-	/* Only do the insert if we allocated a new fnp. */
-	if (reuse_fnp == NULL && recover_fnp == NULL)
-		SH_TAILQ_INSERT_HEAD(&lp->fq, fnp, q, __fname);
-	inserted = 1;
-
-	/* Log the registry. */
-	if (!F_ISSET(dblp, DBLOG_RECOVER)) {
-		/*
-		 * We allow logging on in-memory databases, so the name here
-		 * could be NULL.
-		 */
-		if (name != NULL) {
-			r_name.data = (void *)name;
-			r_name.size = strlen(name) + 1;
-		}
-		memset(&fid_dbt, 0, sizeof(fid_dbt));
-		fid_dbt.data = dbp->fileid;
-		fid_dbt.size = DB_FILE_ID_LEN;
-		if ((ret = __log_register_log(dbenv, NULL, &r_unused,
-		    0, LOG_OPEN, name == NULL ? NULL : &r_name,
-		    &fid_dbt, fnp->id, dbp->type, dbp->meta_pgno)) != 0)
-			goto err;
-	}
-
-found:	/*
-	 * If we found the entry in the shared area, then the file is
-	 * already open, so there is no need to log the open.  We only
-	 * log the open and closes on the first open and last close.
-	 */
-	if (!F_ISSET(dblp, DBLOG_RECOVER) &&
-	    (ret = __log_add_logid(dbenv, dblp, dbp, fnp->id)) != 0)
-			goto err;
-
-	if (!F_ISSET(dblp, DBLOG_RECOVER))
-		dbp->log_fileid = fnp->id;
-
-	if (0) {
-err:		if (inserted)
-			SH_TAILQ_REMOVE(&lp->fq, fnp, q, __fname);
-		if (namep != NULL)
-			__db_shalloc_free(dblp->reginfo.addr, namep);
-		if (fnp != NULL)
-			__db_shalloc_free(dblp->reginfo.addr, fnp);
-	}
-
-	R_UNLOCK(dbenv, &dblp->reginfo);
-
-	return (ret);
-}
-
-/*
- * log_unregister --
- *	Discard a registered file name.
- */
-int
-log_unregister(dbenv, dbp)
-	DB_ENV *dbenv;
-	DB *dbp;
-{
-	int ret;
-
-#ifdef HAVE_RPC
-	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
-		return (__dbcl_log_unregister(dbenv, dbp));
-#endif
-
-	PANIC_CHECK(dbenv);
-	ENV_REQUIRES_CONFIG(dbenv, dbenv->lg_handle, DB_INIT_LOG);
-
-	ret = __log_filelist_update(dbenv, dbp, dbp->log_fileid, NULL, NULL);
-	dbp->log_fileid = DB_LOGFILEID_INVALID;
-	return (ret);
-}
-
-/*
- * PUBLIC: int __log_filelist_update
- * PUBLIC:    __P((DB_ENV *, DB *, int32_t, const char *, int *));
- *
- *  Utility player for updating and logging the file list.  Called
- *  for 3 reasons:
- *	1) mark file closed: newname == NULL.
- *	2) change filename: newname != NULL.
- *	3) from recovery to verify & change filename if necessary, set != NULL.
- */
-int
-__log_filelist_update(dbenv, dbp, fid, newname, set)
-	DB_ENV *dbenv;
-	DB *dbp;
-	int32_t fid;
-	const char *newname;
-	int *set;
-{
-	DBT fid_dbt, r_name;
-	DB_LOG *dblp;
-	DB_LSN r_unused;
-	FNAME *fnp;
-	LOG *lp;
-	u_int32_t len, newlen;
-	int ret;
-	void *namep;
-
-	ret = 0;
-	dblp = dbenv->lg_handle;
-	lp = dblp->reginfo.primary;
-
-	R_LOCK(dbenv, &dblp->reginfo);
-
-	/* Find the entry in the log. */
-	for (fnp = SH_TAILQ_FIRST(&lp->fq, __fname);
-	    fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname))
-		if (fid == fnp->id)
-			break;
-	if (fnp == NULL) {
-		__db_err(dbenv, "log_unregister: non-existent file id");
-		ret = EINVAL;
-		goto ret1;
-	}
-
-	/*
-	 * Log the unregistry only if this is the last one and we are
-	 * really closing the file or if this is an abort of a created
-	 * file and we need to make sure there is a record in the log.
-	 */
-	namep = NULL;
-	len = 0;
-	if (fnp->name_off != INVALID_ROFF) {
-		namep = R_ADDR(&dblp->reginfo, fnp->name_off);
-		len = strlen(namep) + 1;
-	}
-	if (!F_ISSET(dblp, DBLOG_RECOVER) && fnp->ref == 1) {
-		if (namep != NULL) {
-			memset(&r_name, 0, sizeof(r_name));
-			r_name.data = namep;
-			r_name.size = len;
-		}
-		memset(&fid_dbt, 0, sizeof(fid_dbt));
-		fid_dbt.data = fnp->ufid;
-		fid_dbt.size = DB_FILE_ID_LEN;
-		if ((ret = __log_register_log(dbenv, NULL, &r_unused,
-		    0, LOG_CLOSE,
-		    fnp->name_off == INVALID_ROFF ? NULL : &r_name,
-		    &fid_dbt, fid, fnp->s_type, fnp->meta_pgno))
-		    != 0)
-			goto ret1;
-	}
-
-	/*
-	 * If we are changing the name we must log this fact.
-	 */
-	if (newname != NULL) {
-		DB_ASSERT(fnp->ref == 1);
-		newlen = strlen(newname) + 1;
-		if (!F_ISSET(dblp, DBLOG_RECOVER)) {
-			r_name.data = (void *) newname;
-			r_name.size = newlen;
-			if ((ret = __log_register_log(dbenv,
-			    NULL, &r_unused, 0, LOG_OPEN, &r_name, &fid_dbt,
-			    fnp->id, fnp->s_type, fnp->meta_pgno)) != 0)
-				goto ret1;
-		}
-
-		/*
-		 * Check to see if the name is already correct.
-		 */
-		if (set != NULL) {
-			if (len != newlen || memcmp(namep, newname, len) != 0)
-				*set = 1;
-			else {
-				*set = 0;
-				goto ret1;
-			}
-		}
-
-		/*
-		 * Change the name, realloc memory if necessary
-		 */
-		if (len < newlen) {
-			__db_shalloc_free(dblp->reginfo.addr,
-			    R_ADDR(&dblp->reginfo, fnp->name_off));
-			if ((ret = __db_shalloc(
-			    dblp->reginfo.addr, newlen, 0, &namep)) != 0) {
-				__db_err(dbenv,
-				    "Unable to allocate memory to register %s",
-				    newname);
-				goto ret1;
-			}
-			fnp->name_off = R_OFFSET(&dblp->reginfo, namep);
-		} else
-			namep = R_ADDR(&dblp->reginfo, fnp->name_off);
-		memcpy(namep, newname, newlen);
-	} else {
-
-		/*
-		 * If more than 1 reference, just decrement the reference
-		 * and return.  Otherwise, free the name if one exists.
-		 */
-		DB_ASSERT(fnp->ref >= 1);
-		--fnp->ref;
-		if (fnp->ref == 0) {
-			if (fnp->name_off != INVALID_ROFF)
-				__db_shalloc_free(dblp->reginfo.addr,
-				    R_ADDR(&dblp->reginfo, fnp->name_off));
-			fnp->name_off = INVALID_ROFF;
-		}
-
-		/*
-		 * Remove from the process local table.  If this
-		 * operation is taking place during recovery, then
-		 * the logid was never added to the table, so do not remove it.
-		 */
-		if (!F_ISSET(dblp, DBLOG_RECOVER))
-			__log_rem_logid(dblp, dbp, fid);
-	}
-
-ret1:	R_UNLOCK(dbenv, &dblp->reginfo);
-	return (ret);
-}
-
-/*
- * __log_file_lock -- lock a file for single access
- *	This only works if logging is on.
- *
- * PUBLIC: int __log_file_lock __P((DB *));
- */
-int
-__log_file_lock(dbp)
-	DB *dbp;
-{
-	DB_ENV *dbenv;
-	DB_LOG *dblp;
-	FNAME *fnp;
-	LOG *lp;
-	int ret;
-
-	dbenv = dbp->dbenv;
-	dblp = dbenv->lg_handle;
-	lp = dblp->reginfo.primary;
-
-	ret = 0;
-	R_LOCK(dbenv, &dblp->reginfo);
-
-	for (fnp = SH_TAILQ_FIRST(&lp->fq, __fname);
-	    fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname)) {
-		if (fnp->ref == 0)
-			continue;
-
-		if (!memcmp(dbp->fileid, fnp->ufid, DB_FILE_ID_LEN)) {
-			if (fnp->meta_pgno == 0) {
-				if (fnp->ref != 1)
-					goto err;
-
-				fnp->locked = 1;
-			} else {
-err:				__db_err(dbp->dbenv, "File is open");
-				ret = EINVAL;
-				goto done;
-			}
-
-		}
-	}
-done:	R_UNLOCK(dbenv, &dblp->reginfo);
-	return (ret);
-}